From 561bfb690998c04134b84cba5720a274a0e0e087 Mon Sep 17 00:00:00 2001 From: Penelope Yong Date: Wed, 15 Jan 2025 16:27:45 +0000 Subject: [PATCH 1/4] Give julia-actions/cache requisite permissions --- .github/workflows/preview.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/preview.yml b/.github/workflows/preview.yml index 16942f92c..1d01f8a88 100644 --- a/.github/workflows/preview.yml +++ b/.github/workflows/preview.yml @@ -10,6 +10,7 @@ concurrency: group: docs permissions: + actions: write contents: write pull-requests: write From 88c9c0d6203f207b92e779e3c20b1986a2eb0b55 Mon Sep 17 00:00:00 2001 From: Penelope Yong Date: Wed, 15 Jan 2025 13:42:50 +0000 Subject: [PATCH 2/4] Reorganise developer documentation into better directory structure --- _quarto.yml | 46 +- .../compiler/design-overview}/index.qmd | 610 +++++++------- .../compiler/minituring-compiler}/index.qmd | 588 +++++++------- .../compiler/minituring-contexts}/index.qmd | 610 +++++++------- .../compiler/model-manual}/index.qmd | 2 + .../contributing}/index.qmd | 154 ++-- .../abstractmcmc-interface}/index.qmd | 644 +++++++-------- .../inference/abstractmcmc-turing}/index.qmd | 656 +++++++-------- .../implementing-samplers}/index.qmd | 4 +- .../variational-inference}/index.qmd | 768 +++++++++--------- 10 files changed, 2047 insertions(+), 2035 deletions(-) rename {tutorials/docs-05-for-developers-compiler => developers/compiler/design-overview}/index.qmd (98%) rename {tutorials/14-minituring => developers/compiler/minituring-compiler}/index.qmd (97%) rename {tutorials/16-contexts => developers/compiler/minituring-contexts}/index.qmd (97%) rename {tutorials/dev-model-manual => developers/compiler/model-manual}/index.qmd (96%) rename {tutorials/docs-01-contributing-guide => developers/contributing}/index.qmd (98%) rename {tutorials/docs-06-for-developers-interface => developers/inference/abstractmcmc-interface}/index.qmd (97%) rename {tutorials/docs-04-for-developers-abstractmcmc-turing => developers/inference/abstractmcmc-turing}/index.qmd (97%) rename {tutorials/docs-17-implementing-samplers => developers/inference/implementing-samplers}/index.qmd (99%) rename {tutorials/docs-07-for-developers-variational-inference => developers/inference/variational-inference}/index.qmd (98%) diff --git a/_quarto.yml b/_quarto.yml index 8d5a175c4..586af0f35 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -100,30 +100,23 @@ website: - section: "Developers" contents: - - section: "Contributing" - collapse-level: 1 - contents: - - text: "How to Contribute" - href: tutorials/docs-01-contributing-guide/index.qmd + - developers/contributing/index.qmd - - section: "DynamicPPL in Depth" + - section: "DynamicPPL's Compiler" collapse-level: 1 contents: - - tutorials/dev-model-manual/index.qmd - - tutorials/docs-05-for-developers-compiler/index.qmd - - text: "A Mini Turing Implementation I: Compiler" - href: tutorials/14-minituring/index.qmd - - text: "A Mini Turing Implementation II: Contexts" - href: tutorials/16-contexts/index.qmd + - developers/compiler/model-manual/index.qmd + - developers/compiler/minituring-compiler/index.qmd + - developers/compiler/minituring-contexts/index.qmd + - developers/compiler/design-overview/index.qmd - section: "Inference (note: outdated)" collapse-level: 1 contents: - - tutorials/docs-06-for-developers-interface/index.qmd - - tutorials/docs-04-for-developers-abstractmcmc-turing/index.qmd - - tutorials/docs-07-for-developers-variational-inference/index.qmd - - text: "Implementing Samplers" - href: 
tutorials/docs-17-implementing-samplers/index.qmd + - developers/inference/abstractmcmc-interface/index.qmd + - developers/inference/abstractmcmc-turing/index.qmd + - developers/inference/variational-inference/index.qmd + - developers/inference/implementing-samplers/index.qmd page-footer: background: "#073c44" @@ -180,13 +173,6 @@ bayesian-differential-equations: tutorials/10-bayesian-differential-equations probabilistic-pca: tutorials/11-probabilistic-pca gplvm: tutorials/12-gplvm seasonal-time-series: tutorials/13-seasonal-time-series -contexts: tutorials/16-contexts -minituring: tutorials/14-minituring -contributing-guide: tutorials/docs-01-contributing-guide -using-turing-abstractmcmc: tutorials/docs-04-for-developers-abstractmc-turing -using-turing-compiler: tutorials/docs-05-for-developers-compiler -using-turing-interface: tutorials/docs-06-for-developers-interface -using-turing-variational-inference: tutorials/docs-07-for-developers-variational-inference using-turing-advanced: tutorials/docs-09-using-turing-advanced using-turing-autodiff: tutorials/docs-10-using-turing-autodiff using-turing-dynamichmc: tutorials/docs-11-using-turing-dynamichmc @@ -194,10 +180,18 @@ using-turing: tutorials/docs-12-using-turing-guide using-turing-performance-tips: tutorials/docs-13-using-turing-performance-tips using-turing-sampler-viz: tutorials/docs-15-using-turing-sampler-viz using-turing-external-samplers: tutorials/docs-16-using-turing-external-samplers -using-turing-implementing-samplers: tutorials/docs-17-implementing-samplers using-turing-mode-estimation: tutorials/docs-17-mode-estimation usage-probability-interface: tutorials/usage-probability-interface usage-custom-distribution: tutorials/usage-custom-distribution usage-generated-quantities: tutorials/usage-generated-quantities usage-modifying-logprob: tutorials/usage-modifying-logprob -dev-model-manual: tutorials/dev-model-manual + +contributing-guide: developers/contributing +dev-model-manual: developers/compiler/model-manual +contexts: developers/compiler/minituring-contexts +minituring: developers/compiler/minituring-compiler +using-turing-compiler: developers/compiler/design-overview +using-turing-abstractmcmc: developers/inference/abstractmcmc-turing +using-turing-interface: developers/inference/abstractmcmc-interface +using-turing-variational-inference: developers/inference/variational-inference +using-turing-implementing-samplers: developers/inference/implementing-samplers diff --git a/tutorials/docs-05-for-developers-compiler/index.qmd b/developers/compiler/design-overview/index.qmd similarity index 98% rename from tutorials/docs-05-for-developers-compiler/index.qmd rename to developers/compiler/design-overview/index.qmd index 20013d92c..e389c44d0 100755 --- a/tutorials/docs-05-for-developers-compiler/index.qmd +++ b/developers/compiler/design-overview/index.qmd @@ -1,304 +1,306 @@ ---- -title: Turing Compiler Design -engine: julia ---- - -```{julia} -#| echo: false -#| output: false -using Pkg; -Pkg.instantiate(); -``` - -In this section, the current design of Turing's model "compiler" is described which enables Turing to perform various types of Bayesian inference without changing the model definition. The "compiler" is essentially just a macro that rewrites the user's model definition to a function that generates a `Model` struct that Julia's dispatch can operate on and that Julia's compiler can successfully do type inference on for efficient machine code generation. 
- -# Overview - -The following terminology will be used in this section: - - - `D`: observed data variables conditioned upon in the posterior, - - `P`: parameter variables distributed according to the prior distributions, these will also be referred to as random variables, - - `Model`: a fully defined probabilistic model with input data - -`Turing`'s `@model` macro rewrites the user-provided function definition such that it can be used to instantiate a `Model` by passing in the observed data `D`. - -The following are the main jobs of the `@model` macro: - - 1. Parse `~` and `.~` lines, e.g. `y .~ Normal.(c*x, 1.0)` - 2. Figure out if a variable belongs to the data `D` and or to the parameters `P` - 3. Enable the handling of missing data variables in `D` when defining a `Model` and treating them as parameter variables in `P` instead - 4. Enable the tracking of random variables using the data structures `VarName` and `VarInfo` - 5. Change `~`/`.~` lines with a variable in `P` on the LHS to a call to `tilde_assume` or `dot_tilde_assume` - 6. Change `~`/`.~` lines with a variable in `D` on the LHS to a call to `tilde_observe` or `dot_tilde_observe` - 7. Enable type stable automatic differentiation of the model using type parameters - -## The model - -A `model::Model` is a callable struct that one can sample from by calling - -```{julia} -#| eval: false -(model::Model)([rng, varinfo, sampler, context]) -``` - -where `rng` is a random number generator (default: `Random.default_rng()`), `varinfo` is a data structure that stores information -about the random variables (default: `DynamicPPL.VarInfo()`), `sampler` is a sampling algorithm (default: `DynamicPPL.SampleFromPrior()`), -and `context` is a sampling context that can, e.g., modify how the log probability is accumulated (default: `DynamicPPL.DefaultContext()`). - -Sampling resets the log joint probability of `varinfo` and increases the evaluation counter of `sampler`. If `context` is a `LikelihoodContext`, -only the log likelihood of `D` will be accumulated, whereas with `PriorContext` only the log prior probability of `P` is. With the `DefaultContext` the log joint probability of both `P` and `D` is accumulated. - -The `Model` struct contains the four internal fields `f`, `args`, `defaults`, and `context`. -When `model::Model` is called, then the internal function `model.f` is called as `model.f(rng, varinfo, sampler, context, model.args...)` -(for multithreaded sampling, instead of `varinfo` a threadsafe wrapper is passed to `model.f`). -The positional and keyword arguments that were passed to the user-defined model function when the model was created are saved as a `NamedTuple` -in `model.args`. The default values of the positional and keyword arguments of the user-defined model functions, if any, are saved as a `NamedTuple` -in `model.defaults`. They are used for constructing model instances with different arguments by the `logprob` and `prob` string macros. -The `context` variable sets an evaluation context that can be used to control for instance whether log probabilities should be evaluated for the prior, likelihood, or joint probability. By default it is set to evaluate the log joint. - -# Example - -Let's take the following model as an example: - -```{julia} -#| eval: false -@model function gauss( - x=missing, y=1.0, ::Type{TV}=Vector{Float64} -) where {TV<:AbstractVector} - if x === missing - x = TV(undef, 3) - end - p = TV(undef, 2) - p[1] ~ InverseGamma(2, 3) - p[2] ~ Normal(0, 1.0) - @. 
x[1:2] ~ Normal(p[2], sqrt(p[1])) - x[3] ~ Normal() - return y ~ Normal(p[2], sqrt(p[1])) -end -``` - -The above call of the `@model` macro defines the function `gauss` with positional arguments `x`, `y`, and `::Type{TV}`, rewritten in -such a way that every call of it returns a `model::Model`. Note that only the function body is modified by the `@model` macro, and the -function signature is left untouched. It is also possible to implement models with keyword arguments such as - -```{julia} -#| eval: false -@model function gauss( - ::Type{TV}=Vector{Float64}; x=missing, y=1.0 -) where {TV<:AbstractVector} - return ... -end -``` - -This would allow us to generate a model by calling `gauss(; x = rand(3))`. - -If an argument has a default value `missing`, it is treated as a random variable. For variables which require an initialization because we -need to loop or broadcast over its elements, such as `x` above, the following needs to be done: - -```{julia} -#| eval: false -if x === missing - x = ... -end -``` - -Note that since `gauss` behaves like a regular function it is possible to define additional dispatches in a second step as well. For -instance, we could achieve the same behaviour by - -```{julia} -#| eval: false -@model function gauss(x, y=1.0, ::Type{TV}=Vector{Float64}) where {TV<:AbstractVector} - p = TV(undef, 2) - return ... -end - -function gauss(::Missing, y=1.0, ::Type{TV}=Vector{Float64}) where {TV<:AbstractVector} - return gauss(TV(undef, 3), y, TV) -end -``` - -If `x` is sampled as a whole from a distribution and not indexed, e.g., `x ~ Normal(...)` or `x ~ MvNormal(...)`, -there is no need to initialize it in an `if`-block. - -## Step 1: Break up the model definition - -First, the `@model` macro breaks up the user-provided function definition using `DynamicPPL.build_model_info`. This function -returns a dictionary consisting of: - - - `allargs_exprs`: The expressions of the positional and keyword arguments, without default values. - - `allargs_syms`: The names of the positional and keyword arguments, e.g., `[:x, :y, :TV]` above. - - `allargs_namedtuple`: An expression that constructs a `NamedTuple` of the positional and keyword arguments, e.g., `:((x = x, y = y, TV = TV))` above. - - `defaults_namedtuple`: An expression that constructs a `NamedTuple` of the default positional and keyword arguments, if any, e.g., `:((x = missing, y = 1, TV = Vector{Float64}))` above. - - `modeldef`: A dictionary with the name, arguments, and function body of the model definition, as returned by `MacroTools.splitdef`. - -## Step 2: Generate the body of the internal model function - -In a second step, `DynamicPPL.generate_mainbody` generates the main part of the transformed function body using the user-provided function body -and the provided function arguments, without default values, for figuring out if a variable denotes an observation or a random variable. -Hereby the function `DynamicPPL.generate_tilde` replaces the `L ~ R` lines in the model and the function `DynamicPPL.generate_dot_tilde` replaces -the `@. L ~ R` and `L .~ R` lines in the model. 
- -In the above example, `p[1] ~ InverseGamma(2, 3)` is replaced with something similar to - -```{julia} -#| eval: false -#= REPL[25]:6 =# -begin - var"##tmpright#323" = InverseGamma(2, 3) - var"##tmpright#323" isa Union{Distribution,AbstractVector{<:Distribution}} || throw( - ArgumentError( - "Right-hand side of a ~ must be subtype of Distribution or a vector of Distributions.", - ), - ) - var"##vn#325" = (DynamicPPL.VarName)(:p, ((1,),)) - var"##inds#326" = ((1,),) - p[1] = (DynamicPPL.tilde_assume)( - _rng, - _context, - _sampler, - var"##tmpright#323", - var"##vn#325", - var"##inds#326", - _varinfo, - ) -end -``` - -Here the first line is a so-called line number node that enables more helpful error messages by providing users with the exact location -of the error in their model definition. Then the right hand side (RHS) of the `~` is assigned to a variable (with an automatically generated name). -We check that the RHS is a distribution or an array of distributions, otherwise an error is thrown. -Next we extract a compact representation of the variable with its name and index (or indices). Finally, the `~` expression is replaced with -a call to `DynamicPPL.tilde_assume` since the compiler figured out that `p[1]` is a random variable using the following -heuristic: - - 1. If the symbol on the LHS of `~`, `:p` in this case, is not among the arguments to the model, `(:x, :y, :T)` in this case, it is a random variable. - 2. If the symbol on the LHS of `~`, `:p` in this case, is among the arguments to the model but has a value of `missing`, it is a random variable. - 3. If the value of the LHS of `~`, `p[1]` in this case, is `missing`, then it is a random variable. - 4. Otherwise, it is treated as an observation. - -The `DynamicPPL.tilde_assume` function takes care of sampling the random variable, if needed, and updating its value and the accumulated log joint -probability in the `_varinfo` object. If `L ~ R` is an observation, `DynamicPPL.tilde_observe` is called with the same arguments except the -random number generator `_rng` (since observations are never sampled). - -A similar transformation is performed for expressions of the form `@. L ~ R` and `L .~ R`. For instance, -`@. x[1:2] ~ Normal(p[2], sqrt(p[1]))` is replaced with - -```{julia} -#| eval: false -#= REPL[25]:8 =# -begin - var"##tmpright#331" = Normal.(p[2], sqrt.(p[1])) - var"##tmpright#331" isa Union{Distribution,AbstractVector{<:Distribution}} || throw( - ArgumentError( - "Right-hand side of a ~ must be subtype of Distribution or a vector of Distributions.", - ), - ) - var"##vn#333" = (DynamicPPL.VarName)(:x, ((1:2,),)) - var"##inds#334" = ((1:2,),) - var"##isassumption#335" = begin - let var"##vn#336" = (DynamicPPL.VarName)(:x, ((1:2,),)) - if !((DynamicPPL.inargnames)(var"##vn#336", _model)) || - (DynamicPPL.inmissings)(var"##vn#336", _model) - true - else - x[1:2] === missing - end - end - end - if var"##isassumption#335" - x[1:2] .= (DynamicPPL.dot_tilde_assume)( - _rng, - _context, - _sampler, - var"##tmpright#331", - x[1:2], - var"##vn#333", - var"##inds#334", - _varinfo, - ) - else - (DynamicPPL.dot_tilde_observe)( - _context, - _sampler, - var"##tmpright#331", - x[1:2], - var"##vn#333", - var"##inds#334", - _varinfo, - ) - end -end -``` - -The main difference in the expanded code between `L ~ R` and `@. L ~ R` is that the former doesn't assume `L` to be defined, it can be a new Julia variable in the scope, while the latter assumes `L` already exists. 
Moreover, `DynamicPPL.dot_tilde_assume` and `DynamicPPL.dot_tilde_observe` are called -instead of `DynamicPPL.tilde_assume` and `DynamicPPL.tilde_observe`. - -## Step 3: Replace the user-provided function body - -Finally, we replace the user-provided function body using `DynamicPPL.build_output`. This function uses `MacroTools.combinedef` to reassemble -the user-provided function with a new function body. In the modified function body an anonymous function is created whose function body -was generated in step 2 above and whose arguments are - - - a random number generator `_rng`, - - a model `_model`, - - a datastructure `_varinfo`, - - a sampler `_sampler`, - - a sampling context `_context`, - - and all positional and keyword arguments of the user-provided model function as positional arguments - without any default values. Finally, in the new function body a `model::Model` with this anonymous function as internal function is returned. - -# `VarName` - -In order to track random variables in the sampling process, `Turing` uses the `VarName` struct which acts as a random variable identifier generated at runtime. The `VarName` of a random variable is generated from the expression on the LHS of a `~` statement when the symbol on the LHS is in the set `P` of unobserved random variables. Every `VarName` instance has a type parameter `sym` which is the symbol of the Julia variable in the model that the random variable belongs to. For example, `x[1] ~ Normal()` will generate an instance of `VarName{:x}` assuming `x` is an unobserved random variable. Every `VarName` also has a field `indexing`, which stores the indices required to access the random variable from the Julia variable indicated by `sym` as a tuple of tuples. Each element of the tuple thereby contains the indices of one indexing operation (`VarName` also supports hierarchical arrays and range indexing). Some examples: - - - `x ~ Normal()` will generate a `VarName(:x, ())`. - - `x[1] ~ Normal()` will generate a `VarName(:x, ((1,),))`. - - `x[:,1] ~ MvNormal(zeros(2), I)` will generate a `VarName(:x, ((Colon(), 1),))`. - - `x[:,1][1+1] ~ Normal()` will generate a `VarName(:x, ((Colon(), 1), (2,)))`. - -The easiest way to manually construct a `VarName` is to use the `@varname` macro on an indexing expression, which will take the `sym` value from the actual variable name, and put the index values appropriately into the constructor. - -# `VarInfo` - -## Overview - -`VarInfo` is the data structure in `Turing` that facilitates tracking random variables and certain metadata about them that are required for sampling. For instance, the distribution of every random variable is stored in `VarInfo` because we need to know the support of every random variable when sampling using HMC for example. Random variables whose distributions have a constrained support are transformed using a bijector from [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl) so that the sampling happens in the unconstrained space. Different samplers require different metadata about the random variables. - -The definition of `VarInfo` in `Turing` is: - -```{julia} -#| eval: false -struct VarInfo{Tmeta, Tlogp} <: AbstractVarInfo - metadata::Tmeta - logp::Base.RefValue{Tlogp} - num_produce::Base.RefValue{Int} -end -``` - -Based on the type of `metadata`, the `VarInfo` is either aliased `UntypedVarInfo` or `TypedVarInfo`. `metadata` can be either a subtype of the union type `Metadata` or a `NamedTuple` of multiple such subtypes. Let `vi` be an instance of `VarInfo`. 
If `vi isa VarInfo{<:Metadata}`, then it is called an `UntypedVarInfo`. If `vi isa VarInfo{<:NamedTuple}`, then `vi.metadata` would be a `NamedTuple` mapping each symbol in `P` to an instance of `Metadata`. `vi` would then be called a `TypedVarInfo`. The other fields of `VarInfo` include `logp` which is used to accumulate the log probability or log probability density of the variables in `P` and `D`. `num_produce` keeps track of how many observations have been made in the model so far. This is incremented when running a `~` statement when the symbol on the LHS is in `D`. - -## `Metadata` - -The `Metadata` struct stores some metadata about the random variables sampled. This helps -query certain information about a variable such as: its distribution, which samplers -sample this variable, its value and whether this value is transformed to real space or -not. Let `md` be an instance of `Metadata`: - - - `md.vns` is the vector of all `VarName` instances. Let `vn` be an arbitrary element of `md.vns` - - `md.idcs` is the dictionary that maps each `VarName` instance to its index in - `md.vns`, `md.ranges`, `md.dists`, `md.orders` and `md.flags`. - - `md.vns[md.idcs[vn]] == vn`. - - `md.dists[md.idcs[vn]]` is the distribution of `vn`. - - `md.gids[md.idcs[vn]]` is the set of algorithms used to sample `vn`. This is used in - the Gibbs sampling process. - - `md.orders[md.idcs[vn]]` is the number of `observe` statements before `vn` is sampled. - - `md.ranges[md.idcs[vn]]` is the index range of `vn` in `md.vals`. - - `md.vals[md.ranges[md.idcs[vn]]]` is the linearized vector of values of corresponding to `vn`. - - `md.flags` is a dictionary of true/false flags. `md.flags[flag][md.idcs[vn]]` is the - value of `flag` corresponding to `vn`. - -Note that in order to make `md::Metadata` type stable, all the `md.vns` must have the same symbol and distribution type. However, one can have a single Julia variable, e.g. `x`, that is a matrix or a hierarchical array sampled in partitions, e.g. `x[1][:] ~ MvNormal(zeros(2), I); x[2][:] ~ MvNormal(ones(2), I)`. The symbol `x` can still be managed by a single `md::Metadata` without hurting the type stability since all the distributions on the RHS of `~` are of the same type. - -However, in `Turing` models one cannot have this restriction, so we must use a type unstable `Metadata` if we want to use one `Metadata` instance for the whole model. This is what `UntypedVarInfo` does. A type unstable `Metadata` will still work but will have inferior performance. - -To strike a balance between flexibility and performance when constructing the `spl::Sampler` instance, the model is first run by sampling the parameters in `P` from their priors using an `UntypedVarInfo`, i.e. a type unstable `Metadata` is used for all the variables. Then once all the symbols and distribution types have been identified, a `vi::TypedVarInfo` is constructed where `vi.metadata` is a `NamedTuple` mapping each symbol in `P` to a specialized instance of `Metadata`. So as long as each symbol in `P` is sampled from only one type of distributions, `vi::TypedVarInfo` will have fully concretely typed fields which brings out the peak performance of Julia. 
+---
+title: Turing Compiler Design (Outdated)
+engine: julia
+aliases:
+ - ../../../tutorials/docs-05-for-developers-compiler/index.html
+---
+
+```{julia}
+#| echo: false
+#| output: false
+using Pkg;
+Pkg.instantiate();
+```
+
+This section describes the current design of Turing's model "compiler", which enables Turing to perform various types of Bayesian inference without changing the model definition. The "compiler" is essentially just a macro that rewrites the user's model definition to a function that generates a `Model` struct that Julia's dispatch can operate on and that Julia's compiler can successfully do type inference on for efficient machine code generation.
+
+# Overview
+
+The following terminology will be used in this section:
+
+ - `D`: observed data variables conditioned upon in the posterior,
+ - `P`: parameter variables distributed according to the prior distributions; these will also be referred to as random variables,
+ - `Model`: a fully defined probabilistic model with input data
+
+`Turing`'s `@model` macro rewrites the user-provided function definition such that it can be used to instantiate a `Model` by passing in the observed data `D`.
+
+The following are the main jobs of the `@model` macro:
+
+ 1. Parse `~` and `.~` lines, e.g. `y .~ Normal.(c*x, 1.0)`
+ 2. Figure out if a variable belongs to the data `D` and/or to the parameters `P`
+ 3. Enable the handling of missing data variables in `D` when defining a `Model` and treating them as parameter variables in `P` instead
+ 4. Enable the tracking of random variables using the data structures `VarName` and `VarInfo`
+ 5. Change `~`/`.~` lines with a variable in `P` on the LHS to a call to `tilde_assume` or `dot_tilde_assume`
+ 6. Change `~`/`.~` lines with a variable in `D` on the LHS to a call to `tilde_observe` or `dot_tilde_observe`
+ 7. Enable type stable automatic differentiation of the model using type parameters
+
+## The model
+
+A `model::Model` is a callable struct that one can sample from by calling
+
+```{julia}
+#| eval: false
+(model::Model)([rng, varinfo, sampler, context])
+```
+
+where `rng` is a random number generator (default: `Random.default_rng()`), `varinfo` is a data structure that stores information
+about the random variables (default: `DynamicPPL.VarInfo()`), `sampler` is a sampling algorithm (default: `DynamicPPL.SampleFromPrior()`),
+and `context` is a sampling context that can, e.g., modify how the log probability is accumulated (default: `DynamicPPL.DefaultContext()`).
+
+Sampling resets the log joint probability of `varinfo` and increases the evaluation counter of `sampler`. If `context` is a `LikelihoodContext`,
+only the log likelihood of `D` will be accumulated, whereas with `PriorContext` only the log prior probability of `P` is. With the `DefaultContext` the log joint probability of both `P` and `D` is accumulated.
+
+The `Model` struct contains the four internal fields `f`, `args`, `defaults`, and `context`.
+When `model::Model` is called, then the internal function `model.f` is called as `model.f(rng, varinfo, sampler, context, model.args...)`
+(for multithreaded sampling, instead of `varinfo` a threadsafe wrapper is passed to `model.f`).
+The positional and keyword arguments that were passed to the user-defined model function when the model was created are saved as a `NamedTuple`
+in `model.args`. The default values of the positional and keyword arguments of the user-defined model function, if any, are saved as a `NamedTuple`
+in `model.defaults`. They are used for constructing model instances with different arguments by the `logprob` and `prob` string macros.
+The `context` variable sets an evaluation context that can be used to control, for instance, whether log probabilities should be evaluated for the prior, likelihood, or joint probability. By default it is set to evaluate the log joint.
+
+# Example
+
+Let's take the following model as an example:
+
+```{julia}
+#| eval: false
+@model function gauss(
+    x=missing, y=1.0, ::Type{TV}=Vector{Float64}
+) where {TV<:AbstractVector}
+    if x === missing
+        x = TV(undef, 3)
+    end
+    p = TV(undef, 2)
+    p[1] ~ InverseGamma(2, 3)
+    p[2] ~ Normal(0, 1.0)
+    @. x[1:2] ~ Normal(p[2], sqrt(p[1]))
+    x[3] ~ Normal()
+    return y ~ Normal(p[2], sqrt(p[1]))
+end
+```
+
+The above call of the `@model` macro defines the function `gauss` with positional arguments `x`, `y`, and `::Type{TV}`, rewritten in
+such a way that every call of it returns a `model::Model`. Note that only the function body is modified by the `@model` macro, and the
+function signature is left untouched. It is also possible to implement models with keyword arguments such as
+
+```{julia}
+#| eval: false
+@model function gauss(
+    ::Type{TV}=Vector{Float64}; x=missing, y=1.0
+) where {TV<:AbstractVector}
+    return ...
+end
+```
+
+This would allow us to generate a model by calling `gauss(; x = rand(3))`.
+
+If an argument has a default value `missing`, it is treated as a random variable. For variables which require an initialization because we
+need to loop or broadcast over its elements, such as `x` above, the following needs to be done:
+
+```{julia}
+#| eval: false
+if x === missing
+    x = ...
+end
+```
+
+Note that since `gauss` behaves like a regular function it is possible to define additional dispatches in a second step as well. For
+instance, we could achieve the same behaviour by
+
+```{julia}
+#| eval: false
+@model function gauss(x, y=1.0, ::Type{TV}=Vector{Float64}) where {TV<:AbstractVector}
+    p = TV(undef, 2)
+    return ...
+end
+
+function gauss(::Missing, y=1.0, ::Type{TV}=Vector{Float64}) where {TV<:AbstractVector}
+    return gauss(TV(undef, 3), y, TV)
+end
+```
+
+If `x` is sampled as a whole from a distribution and not indexed, e.g., `x ~ Normal(...)` or `x ~ MvNormal(...)`,
+there is no need to initialize it in an `if`-block.
+
+## Step 1: Break up the model definition
+
+First, the `@model` macro breaks up the user-provided function definition using `DynamicPPL.build_model_info`. This function
+returns a dictionary consisting of:
+
+ - `allargs_exprs`: The expressions of the positional and keyword arguments, without default values.
+ - `allargs_syms`: The names of the positional and keyword arguments, e.g., `[:x, :y, :TV]` above.
+ - `allargs_namedtuple`: An expression that constructs a `NamedTuple` of the positional and keyword arguments, e.g., `:((x = x, y = y, TV = TV))` above.
+ - `defaults_namedtuple`: An expression that constructs a `NamedTuple` of the default positional and keyword arguments, if any, e.g., `:((x = missing, y = 1, TV = Vector{Float64}))` above.
+ - `modeldef`: A dictionary with the name, arguments, and function body of the model definition, as returned by `MacroTools.splitdef`.
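+
+To get a more concrete feel for this step, one can call `MacroTools.splitdef` directly on a small function definition. The sketch below only illustrates `splitdef` itself; the exact fields that `build_model_info` assembles from its output are the ones listed above.
+
+```{julia}
+#| eval: false
+using MacroTools
+
+ex = :(function gauss(x=missing, y=1.0)
+    return x
+end)
+def = MacroTools.splitdef(ex)
+def[:name]  # :gauss
+def[:args]  # the argument expressions, roughly [:(x = missing), :(y = 1.0)]
+def[:body]  # the function body as an Expr
+```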
+
+## Step 2: Generate the body of the internal model function
+
+In a second step, `DynamicPPL.generate_mainbody` generates the main part of the transformed function body using the user-provided function body
+and the provided function arguments, without default values, to figure out if a variable denotes an observation or a random variable.
+Here, the function `DynamicPPL.generate_tilde` replaces the `L ~ R` lines in the model and the function `DynamicPPL.generate_dot_tilde` replaces
+the `@. L ~ R` and `L .~ R` lines in the model.
+
+In the above example, `p[1] ~ InverseGamma(2, 3)` is replaced with something similar to
+
+```{julia}
+#| eval: false
+#= REPL[25]:6 =#
+begin
+    var"##tmpright#323" = InverseGamma(2, 3)
+    var"##tmpright#323" isa Union{Distribution,AbstractVector{<:Distribution}} || throw(
+        ArgumentError(
+            "Right-hand side of a ~ must be subtype of Distribution or a vector of Distributions.",
+        ),
+    )
+    var"##vn#325" = (DynamicPPL.VarName)(:p, ((1,),))
+    var"##inds#326" = ((1,),)
+    p[1] = (DynamicPPL.tilde_assume)(
+        _rng,
+        _context,
+        _sampler,
+        var"##tmpright#323",
+        var"##vn#325",
+        var"##inds#326",
+        _varinfo,
+    )
+end
+```
+
+Here the first line is a so-called line number node that enables more helpful error messages by providing users with the exact location
+of the error in their model definition. Then the right hand side (RHS) of the `~` is assigned to a variable (with an automatically generated name).
+We check that the RHS is a distribution or an array of distributions, otherwise an error is thrown.
+Next we extract a compact representation of the variable with its name and index (or indices). Finally, the `~` expression is replaced with
+a call to `DynamicPPL.tilde_assume` since the compiler figured out that `p[1]` is a random variable using the following
+heuristic:
+
+ 1. If the symbol on the LHS of `~`, `:p` in this case, is not among the arguments to the model, `(:x, :y, :TV)` in this case, it is a random variable.
+ 2. If the symbol on the LHS of `~`, `:p` in this case, is among the arguments to the model but has a value of `missing`, it is a random variable.
+ 3. If the value of the LHS of `~`, `p[1]` in this case, is `missing`, then it is a random variable.
+ 4. Otherwise, it is treated as an observation.
+
+The `DynamicPPL.tilde_assume` function takes care of sampling the random variable, if needed, and updating its value and the accumulated log joint
+probability in the `_varinfo` object. If `L ~ R` is an observation, `DynamicPPL.tilde_observe` is called with the same arguments except the
+random number generator `_rng` (since observations are never sampled).
+
+A similar transformation is performed for expressions of the form `@. L ~ R` and `L .~ R`. For instance,
+`@.
x[1:2] ~ Normal(p[2], sqrt(p[1]))` is replaced with + +```{julia} +#| eval: false +#= REPL[25]:8 =# +begin + var"##tmpright#331" = Normal.(p[2], sqrt.(p[1])) + var"##tmpright#331" isa Union{Distribution,AbstractVector{<:Distribution}} || throw( + ArgumentError( + "Right-hand side of a ~ must be subtype of Distribution or a vector of Distributions.", + ), + ) + var"##vn#333" = (DynamicPPL.VarName)(:x, ((1:2,),)) + var"##inds#334" = ((1:2,),) + var"##isassumption#335" = begin + let var"##vn#336" = (DynamicPPL.VarName)(:x, ((1:2,),)) + if !((DynamicPPL.inargnames)(var"##vn#336", _model)) || + (DynamicPPL.inmissings)(var"##vn#336", _model) + true + else + x[1:2] === missing + end + end + end + if var"##isassumption#335" + x[1:2] .= (DynamicPPL.dot_tilde_assume)( + _rng, + _context, + _sampler, + var"##tmpright#331", + x[1:2], + var"##vn#333", + var"##inds#334", + _varinfo, + ) + else + (DynamicPPL.dot_tilde_observe)( + _context, + _sampler, + var"##tmpright#331", + x[1:2], + var"##vn#333", + var"##inds#334", + _varinfo, + ) + end +end +``` + +The main difference in the expanded code between `L ~ R` and `@. L ~ R` is that the former doesn't assume `L` to be defined, it can be a new Julia variable in the scope, while the latter assumes `L` already exists. Moreover, `DynamicPPL.dot_tilde_assume` and `DynamicPPL.dot_tilde_observe` are called +instead of `DynamicPPL.tilde_assume` and `DynamicPPL.tilde_observe`. + +## Step 3: Replace the user-provided function body + +Finally, we replace the user-provided function body using `DynamicPPL.build_output`. This function uses `MacroTools.combinedef` to reassemble +the user-provided function with a new function body. In the modified function body an anonymous function is created whose function body +was generated in step 2 above and whose arguments are + + - a random number generator `_rng`, + - a model `_model`, + - a datastructure `_varinfo`, + - a sampler `_sampler`, + - a sampling context `_context`, + - and all positional and keyword arguments of the user-provided model function as positional arguments + without any default values. Finally, in the new function body a `model::Model` with this anonymous function as internal function is returned. + +# `VarName` + +In order to track random variables in the sampling process, `Turing` uses the `VarName` struct which acts as a random variable identifier generated at runtime. The `VarName` of a random variable is generated from the expression on the LHS of a `~` statement when the symbol on the LHS is in the set `P` of unobserved random variables. Every `VarName` instance has a type parameter `sym` which is the symbol of the Julia variable in the model that the random variable belongs to. For example, `x[1] ~ Normal()` will generate an instance of `VarName{:x}` assuming `x` is an unobserved random variable. Every `VarName` also has a field `indexing`, which stores the indices required to access the random variable from the Julia variable indicated by `sym` as a tuple of tuples. Each element of the tuple thereby contains the indices of one indexing operation (`VarName` also supports hierarchical arrays and range indexing). Some examples: + + - `x ~ Normal()` will generate a `VarName(:x, ())`. + - `x[1] ~ Normal()` will generate a `VarName(:x, ((1,),))`. + - `x[:,1] ~ MvNormal(zeros(2), I)` will generate a `VarName(:x, ((Colon(), 1),))`. + - `x[:,1][1+1] ~ Normal()` will generate a `VarName(:x, ((Colon(), 1), (2,)))`. 
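+
+As a rough illustration of the examples above (this follows the indexing-tuple representation described in this section; newer versions of DynamicPPL represent the indexing differently, so treat it as a sketch):
+
+```{julia}
+#| eval: false
+using DynamicPPL
+
+# Corresponds to the expression x[:, 1][2] on the LHS of a ~ statement
+vn = VarName(:x, ((Colon(), 1), (2,)))
+vn isa VarName{:x}  # true: the symbol is carried as a type parameter
+```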
+
+The easiest way to manually construct a `VarName` is to use the `@varname` macro on an indexing expression, which will take the `sym` value from the actual variable name, and put the index values appropriately into the constructor.
+
+# `VarInfo`
+
+## Overview
+
+`VarInfo` is the data structure in `Turing` that facilitates tracking random variables and certain metadata about them that are required for sampling. For instance, the distribution of every random variable is stored in `VarInfo` because we need to know the support of every random variable when sampling using HMC for example. Random variables whose distributions have a constrained support are transformed using a bijector from [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl) so that the sampling happens in the unconstrained space. Different samplers require different metadata about the random variables.
+
+The definition of `VarInfo` in `Turing` is:
+
+```{julia}
+#| eval: false
+struct VarInfo{Tmeta, Tlogp} <: AbstractVarInfo
+    metadata::Tmeta
+    logp::Base.RefValue{Tlogp}
+    num_produce::Base.RefValue{Int}
+end
+```
+
+Based on the type of `metadata`, the `VarInfo` is aliased either `UntypedVarInfo` or `TypedVarInfo`. `metadata` can be either a subtype of the union type `Metadata` or a `NamedTuple` of multiple such subtypes. Let `vi` be an instance of `VarInfo`. If `vi isa VarInfo{<:Metadata}`, then it is called an `UntypedVarInfo`. If `vi isa VarInfo{<:NamedTuple}`, then `vi.metadata` would be a `NamedTuple` mapping each symbol in `P` to an instance of `Metadata`. `vi` would then be called a `TypedVarInfo`. The other fields of `VarInfo` include `logp`, which is used to accumulate the log probability or log probability density of the variables in `P` and `D`. `num_produce` keeps track of how many observations have been made in the model so far. This is incremented when running a `~` statement when the symbol on the LHS is in `D`.
+
+## `Metadata`
+
+The `Metadata` struct stores some metadata about the random variables sampled. This helps
+query certain information about a variable such as: its distribution, which samplers
+sample this variable, its value and whether this value is transformed to real space or
+not. Let `md` be an instance of `Metadata`:
+
+ - `md.vns` is the vector of all `VarName` instances. Let `vn` be an arbitrary element of `md.vns`.
+ - `md.idcs` is the dictionary that maps each `VarName` instance to its index in
+   `md.vns`, `md.ranges`, `md.dists`, `md.orders` and `md.flags`.
+ - `md.vns[md.idcs[vn]] == vn`.
+ - `md.dists[md.idcs[vn]]` is the distribution of `vn`.
+ - `md.gids[md.idcs[vn]]` is the set of algorithms used to sample `vn`. This is used in
+   the Gibbs sampling process.
+ - `md.orders[md.idcs[vn]]` is the number of `observe` statements before `vn` is sampled.
+ - `md.ranges[md.idcs[vn]]` is the index range of `vn` in `md.vals`.
+ - `md.vals[md.ranges[md.idcs[vn]]]` is the linearized vector of values corresponding to `vn`.
+ - `md.flags` is a dictionary of true/false flags. `md.flags[flag][md.idcs[vn]]` is the
+   value of `flag` corresponding to `vn`.
+
+Note that in order to make `md::Metadata` type stable, all the `md.vns` must have the same symbol and distribution type. However, one can have a single Julia variable, e.g. `x`, that is a matrix or a hierarchical array sampled in partitions, e.g. `x[1][:] ~ MvNormal(zeros(2), I); x[2][:] ~ MvNormal(ones(2), I)`. The symbol `x` can still be managed by a single `md::Metadata` without hurting the type stability since all the distributions on the RHS of `~` are of the same type.
+
+However, one cannot enforce this restriction in `Turing` models, so we must use a type unstable `Metadata` if we want to use one `Metadata` instance for the whole model. This is what `UntypedVarInfo` does. A type unstable `Metadata` will still work but will have inferior performance.
+
+To strike a balance between flexibility and performance when constructing the `spl::Sampler` instance, the model is first run by sampling the parameters in `P` from their priors using an `UntypedVarInfo`, i.e. a type unstable `Metadata` is used for all the variables. Then once all the symbols and distribution types have been identified, a `vi::TypedVarInfo` is constructed where `vi.metadata` is a `NamedTuple` mapping each symbol in `P` to a specialized instance of `Metadata`. So as long as each symbol in `P` is sampled from only one type of distribution, `vi::TypedVarInfo` will have fully concretely typed fields, which brings out the peak performance of Julia.
diff --git a/tutorials/14-minituring/index.qmd b/developers/compiler/minituring-compiler/index.qmd
similarity index 97%
rename from tutorials/14-minituring/index.qmd
rename to developers/compiler/minituring-compiler/index.qmd
index 49ab6bcad..c22894b17 100755
--- a/tutorials/14-minituring/index.qmd
+++ b/developers/compiler/minituring-compiler/index.qmd
@@ -1,293 +1,295 @@
----
-title: "A Mini Turing Implementation I: Compiler"
-engine: julia
----
-
-```{julia}
-#| echo: false
-#| output: false
-using Pkg;
-Pkg.instantiate();
-```
-
-In this tutorial we develop a very simple probabilistic programming language.
-The implementation is similar to [DynamicPPL](https://github.com/TuringLang/DynamicPPL.jl).
-This is intentional as we want to demonstrate some key ideas from Turing's internal implementation.
-
-To make things easy to understand and to implement we restrict our language to a very simple subset of the language that Turing actually supports.
-Defining an accurate syntax description is not our goal here, instead, we give a simple example and all similar programs should work.
-
-# Consider a probabilistic model defined by
-
-$$
-\begin{aligned}
-a &\sim \operatorname{Normal}(0.5, 1^2) \\
-b &\sim \operatorname{Normal}(a, 2^2) \\
-x &\sim \operatorname{Normal}(b, 0.5^2)
-\end{aligned}
-$$
-
-We assume that `x` is data, i.e., an observed variable.
-In our small language this model will be defined as
-
-```{julia}
-#| eval: false
-@mini_model function m(x)
-    a ~ Normal(0.5, 1)
-    b ~ Normal(a, 2)
-    x ~ Normal(b, 0.5)
-    return nothing
-end
-```
-
-Specifically, we demand that
-
- - all observed variables are arguments of the program,
- - the model definition does not contain any control flow,
- - all variables are scalars, and
- - the function returns `nothing`.
-
-First, we import some required packages:
-
-```{julia}
-using MacroTools, Distributions, Random, AbstractMCMC, MCMCChains
-```
-
-Before getting to the actual "compiler", we first build the data structure for the program trace.
-A program trace for a probabilistic programming language needs to at least record the values of stochastic variables and their log-probabilities.
- -```{julia} -struct VarInfo{V,L} - values::V - logps::L -end - -VarInfo() = VarInfo(Dict{Symbol,Float64}(), Dict{Symbol,Float64}()) - -function Base.setindex!(varinfo::VarInfo, (value, logp), var_id) - varinfo.values[var_id] = value - varinfo.logps[var_id] = logp - return varinfo -end -``` - -Internally, our probabilistic programming language works with two main functions: - - - `assume` for sampling unobserved variables and computing their log-probabilities, and - - `observe` for computing log-probabilities of observed variables (but not sampling them). - -For different inference algorithms we may have to use different sampling procedures and different log-probability computations. -For instance, in some cases we might want to sample all variables from their prior distributions and in other cases we might only want to compute the log-likelihood of the observations based on a given set of values for the unobserved variables. -Thus depending on the inference algorithm we want to use different `assume` and `observe` implementations. -We can achieve this by providing this `context` information as a function argument to `assume` and `observe`. - -**Note:** *Although the context system in this tutorial is inspired by DynamicPPL, it is very simplistic. -We expand this mini Turing example in the [contexts]({{}}) tutorial with some more complexity, to illustrate how and why contexts are central to Turing's design. For the full details one still needs to go to the actual source of DynamicPPL though.* - -Here we can see the implementation of a sampler that draws values of unobserved variables from the prior and computes the log-probability for every variable. - -```{julia} -struct SamplingContext{S<:AbstractMCMC.AbstractSampler,R<:Random.AbstractRNG} - rng::R - sampler::S -end - -struct PriorSampler <: AbstractMCMC.AbstractSampler end - -function observe(context::SamplingContext, varinfo, dist, var_id, var_value) - logp = logpdf(dist, var_value) - varinfo[var_id] = (var_value, logp) - return nothing -end - -function assume(context::SamplingContext{PriorSampler}, varinfo, dist, var_id) - sample = Random.rand(context.rng, dist) - logp = logpdf(dist, sample) - varinfo[var_id] = (sample, logp) - return sample -end; -``` - -Next we define the "compiler" for our simple programming language. -The term compiler is actually a bit misleading here since its only purpose is to transform the function definition in the `@mini_model` macro by - - - adding the context information (`context`) and the tracing data structure (`varinfo`) as additional arguments, and - - replacing tildes with calls to `assume` and `observe`. - -Afterwards, as usual the Julia compiler will just-in-time compile the model function when it is called. - -The manipulation of Julia expressions is an advanced part of the Julia language. -The [Julia documentation](https://docs.julialang.org/en/v1/manual/metaprogramming/) provides an introduction to and more details about this so-called metaprogramming. - -```{julia} -macro mini_model(expr) - return esc(mini_model(expr)) -end - -function mini_model(expr) - # Split the function definition into a dictionary with its name, arguments, body etc. 
- def = MacroTools.splitdef(expr) - - # Replace tildes in the function body with calls to `assume` or `observe` - def[:body] = MacroTools.postwalk(def[:body]) do sub_expr - if MacroTools.@capture(sub_expr, var_ ~ dist_) - if var in def[:args] - # If the variable is an argument of the model function, it is observed - return :($(observe)(context, varinfo, $dist, $(Meta.quot(var)), $var)) - else - # Otherwise it is unobserved - return :($var = $(assume)(context, varinfo, $dist, $(Meta.quot(var)))) - end - else - return sub_expr - end - end - - # Add `context` and `varinfo` arguments to the model function - def[:args] = vcat(:varinfo, :context, def[:args]) - - # Reassemble the function definition from its name, arguments, body etc. - return MacroTools.combinedef(def) -end; -``` - -For inference, we make use of the [AbstractMCMC interface](https://turinglang.github.io/AbstractMCMC.jl/dev/). -It provides a default implementation of a `sample` function for sampling a Markov chain. -The default implementation already supports e.g. sampling of multiple chains in parallel, thinning of samples, or discarding initial samples. - -The AbstractMCMC interface requires us to at least - - - define a model that is a subtype of `AbstractMCMC.AbstractModel`, - - define a sampler that is a subtype of `AbstractMCMC.AbstractSampler`, - - implement `AbstractMCMC.step` for our model and sampler. - -Thus here we define a `MiniModel` model. -In this model we store the model function and the observed data. - -```{julia} -struct MiniModel{F,D} <: AbstractMCMC.AbstractModel - f::F - data::D # a NamedTuple of all the data -end -``` - -In the Turing compiler, the model-specific `DynamicPPL.Model` is constructed automatically when calling the model function. -But for the sake of simplicity here we construct the model manually. - -To illustrate probabilistic inference with our mini language we implement an extremely simplistic Random-Walk Metropolis-Hastings sampler. -We hard-code the proposal step as part of the sampler and only allow normal distributions with zero mean and fixed standard deviation. -The Metropolis-Hastings sampler in Turing is more flexible. - -```{julia} -struct MHSampler{T<:Real} <: AbstractMCMC.AbstractSampler - sigma::T -end - -MHSampler() = MHSampler(1) - -function assume(context::SamplingContext{<:MHSampler}, varinfo, dist, var_id) - sampler = context.sampler - old_value = varinfo.values[var_id] - - # propose a random-walk step, i.e, add the current value to a random - # value sampled from a Normal distribution centered at 0 - value = rand(context.rng, Normal(old_value, sampler.sigma)) - logp = Distributions.logpdf(dist, value) - varinfo[var_id] = (value, logp) - - return value -end; -``` - -We need to define two `step` functions, one for the first step and the other for the following steps. -In the first step we sample values from the prior distributions and in the following steps we sample with the random-walk proposal. -The two functions are identified by the different arguments they take. - -```{julia} -# The fist step: Sampling from the prior distributions -function AbstractMCMC.step( - rng::Random.AbstractRNG, model::MiniModel, sampler::MHSampler; kwargs... -) - vi = VarInfo() - ctx = SamplingContext(rng, PriorSampler()) - model.f(vi, ctx, values(model.data)...) 
- return vi, vi -end - -# The following steps: Sampling with random-walk proposal -function AbstractMCMC.step( - rng::Random.AbstractRNG, - model::MiniModel, - sampler::MHSampler, - prev_state::VarInfo; # is just the old trace - kwargs..., -) - vi = prev_state - new_vi = deepcopy(vi) - ctx = SamplingContext(rng, sampler) - model.f(new_vi, ctx, values(model.data)...) - - # Compute log acceptance probability - # Since the proposal is symmetric the computation can be simplified - logα = sum(values(new_vi.logps)) - sum(values(vi.logps)) - - # Accept proposal with computed acceptance probability - if -randexp(rng) < logα - return new_vi, new_vi - else - return prev_state, prev_state - end -end; -``` - -To make it easier to analyze the samples and compare them with results from Turing, additionally we define a version of `AbstractMCMC.bundle_samples` for our model and sampler that returns a `MCMCChains.Chains` object of samples. - -```{julia} -function AbstractMCMC.bundle_samples( - samples, model::MiniModel, ::MHSampler, ::Any, ::Type{Chains}; kwargs... -) - # We get a vector of traces - values = [sample.values for sample in samples] - params = [key for key in keys(values[1]) if key ∉ keys(model.data)] - vals = reduce(hcat, [value[p] for value in values] for p in params) - # Composing the `Chains` data-structure, of which analyzing infrastructure is provided - chains = Chains(vals, params) - return chains -end; -``` - -Let us check how our mini probabilistic programming language works. -We define the probabilistic model: - -```{julia} -@mini_model function m(x) - a ~ Normal(0.5, 1) - b ~ Normal(a, 2) - x ~ Normal(b, 0.5) - return nothing -end; -``` - -We perform inference with data `x = 3.0`: - -```{julia} -sample(MiniModel(m, (x=3.0,)), MHSampler(), 1_000_000; chain_type=Chains, progress=false) -``` - -We compare these results with Turing. - -```{julia} -using Turing -using PDMats - -@model function turing_m(x) - a ~ Normal(0.5, 1) - b ~ Normal(a, 2) - x ~ Normal(b, 0.5) - return nothing -end - -sample(turing_m(3.0), MH(ScalMat(2, 1.0)), 1_000_000, progress=false) -``` - -As you can see, with our simple probabilistic programming language and custom samplers we get similar results as Turing. +--- +title: "A Mini Turing Implementation I: Compiler" +engine: julia +aliases: + - ../../../tutorials/14-minituring/index.html +--- + +```{julia} +#| echo: false +#| output: false +using Pkg; +Pkg.instantiate(); +``` + +In this tutorial we develop a very simple probabilistic programming language. +The implementation is similar to [DynamicPPL](https://github.com/TuringLang/DynamicPPL.jl). +This is intentional as we want to demonstrate some key ideas from Turing's internal implementation. + +To make things easy to understand and to implement we restrict our language to a very simple subset of the language that Turing actually supports. +Defining an accurate syntax description is not our goal here, instead, we give a simple example and all similar programs should work. + +# Consider a probabilistic model defined by + +$$ +\begin{aligned} +a &\sim \operatorname{Normal}(0.5, 1^2) \\ +b &\sim \operatorname{Normal}(a, 2^2) \\ +x &\sim \operatorname{Normal}(b, 0.5^2) +\end{aligned} +$$ + +We assume that `x` is data, i.e., an observed variable. 
+In our small language this model will be defined as
+
+```{julia}
+#| eval: false
+@mini_model function m(x)
+    a ~ Normal(0.5, 1)
+    b ~ Normal(a, 2)
+    x ~ Normal(b, 0.5)
+    return nothing
+end
+```
+
+Specifically, we demand that
+
+ - all observed variables are arguments of the program,
+ - the model definition does not contain any control flow,
+ - all variables are scalars, and
+ - the function returns `nothing`.
+
+First, we import some required packages:
+
+```{julia}
+using MacroTools, Distributions, Random, AbstractMCMC, MCMCChains
+```
+
+Before getting to the actual "compiler", we first build the data structure for the program trace.
+A program trace for a probabilistic programming language needs to at least record the values of stochastic variables and their log-probabilities.
+
+```{julia}
+struct VarInfo{V,L}
+    values::V
+    logps::L
+end
+
+VarInfo() = VarInfo(Dict{Symbol,Float64}(), Dict{Symbol,Float64}())
+
+function Base.setindex!(varinfo::VarInfo, (value, logp), var_id)
+    varinfo.values[var_id] = value
+    varinfo.logps[var_id] = logp
+    return varinfo
+end
+```
+
+Internally, our probabilistic programming language works with two main functions:
+
+ - `assume` for sampling unobserved variables and computing their log-probabilities, and
+ - `observe` for computing log-probabilities of observed variables (but not sampling them).
+
+For different inference algorithms we may have to use different sampling procedures and different log-probability computations.
+For instance, in some cases we might want to sample all variables from their prior distributions and in other cases we might only want to compute the log-likelihood of the observations based on a given set of values for the unobserved variables.
+Thus, depending on the inference algorithm, we want to use different `assume` and `observe` implementations.
+We can achieve this by providing the `context` information as a function argument to `assume` and `observe`.
+
+**Note:** *Although the context system in this tutorial is inspired by DynamicPPL, it is very simplistic.
+We expand this mini Turing example in the [contexts]({{< meta contexts >}}) tutorial with some more complexity, to illustrate how and why contexts are central to Turing's design. For the full details one still needs to go to the actual source of DynamicPPL though.*
+
+Here we can see the implementation of a sampler that draws values of unobserved variables from the prior and computes the log-probability for every variable.
+
+```{julia}
+struct SamplingContext{S<:AbstractMCMC.AbstractSampler,R<:Random.AbstractRNG}
+    rng::R
+    sampler::S
+end
+
+struct PriorSampler <: AbstractMCMC.AbstractSampler end
+
+function observe(context::SamplingContext, varinfo, dist, var_id, var_value)
+    logp = logpdf(dist, var_value)
+    varinfo[var_id] = (var_value, logp)
+    return nothing
+end
+
+function assume(context::SamplingContext{PriorSampler}, varinfo, dist, var_id)
+    sample = Random.rand(context.rng, dist)
+    logp = logpdf(dist, sample)
+    varinfo[var_id] = (sample, logp)
+    return sample
+end;
+```
+
+Next we define the "compiler" for our simple programming language.
+The term compiler is actually a bit misleading here since its only purpose is to transform the function definition in the `@mini_model` macro by
+
+ - adding the context information (`context`) and the tracing data structure (`varinfo`) as additional arguments, and
+ - replacing tildes with calls to `assume` and `observe`.
+
+Afterwards, as usual, the Julia compiler will just-in-time compile the model function when it is called.
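+
+For intuition, the `@mini_model` definition of `m` shown earlier would be rewritten into something roughly like the following (hand-expanded here for illustration; the code generated by the macro below differs in its details):
+
+```{julia}
+#| eval: false
+function m(varinfo, context, x)
+    # `a` and `b` are unobserved, so their tildes become `assume` calls
+    a = assume(context, varinfo, Normal(0.5, 1), :a)
+    b = assume(context, varinfo, Normal(a, 2), :b)
+    # `x` is a model argument, so its tilde becomes an `observe` call
+    observe(context, varinfo, Normal(b, 0.5), :x, x)
+    return nothing
+end
+```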
+ +The manipulation of Julia expressions is an advanced part of the Julia language. +The [Julia documentation](https://docs.julialang.org/en/v1/manual/metaprogramming/) provides an introduction to and more details about this so-called metaprogramming. + +```{julia} +macro mini_model(expr) + return esc(mini_model(expr)) +end + +function mini_model(expr) + # Split the function definition into a dictionary with its name, arguments, body etc. + def = MacroTools.splitdef(expr) + + # Replace tildes in the function body with calls to `assume` or `observe` + def[:body] = MacroTools.postwalk(def[:body]) do sub_expr + if MacroTools.@capture(sub_expr, var_ ~ dist_) + if var in def[:args] + # If the variable is an argument of the model function, it is observed + return :($(observe)(context, varinfo, $dist, $(Meta.quot(var)), $var)) + else + # Otherwise it is unobserved + return :($var = $(assume)(context, varinfo, $dist, $(Meta.quot(var)))) + end + else + return sub_expr + end + end + + # Add `context` and `varinfo` arguments to the model function + def[:args] = vcat(:varinfo, :context, def[:args]) + + # Reassemble the function definition from its name, arguments, body etc. + return MacroTools.combinedef(def) +end; +``` + +For inference, we make use of the [AbstractMCMC interface](https://turinglang.github.io/AbstractMCMC.jl/dev/). +It provides a default implementation of a `sample` function for sampling a Markov chain. +The default implementation already supports e.g. sampling of multiple chains in parallel, thinning of samples, or discarding initial samples. + +The AbstractMCMC interface requires us to at least + + - define a model that is a subtype of `AbstractMCMC.AbstractModel`, + - define a sampler that is a subtype of `AbstractMCMC.AbstractSampler`, + - implement `AbstractMCMC.step` for our model and sampler. + +Thus here we define a `MiniModel` model. +In this model we store the model function and the observed data. + +```{julia} +struct MiniModel{F,D} <: AbstractMCMC.AbstractModel + f::F + data::D # a NamedTuple of all the data +end +``` + +In the Turing compiler, the model-specific `DynamicPPL.Model` is constructed automatically when calling the model function. +But for the sake of simplicity here we construct the model manually. + +To illustrate probabilistic inference with our mini language we implement an extremely simplistic Random-Walk Metropolis-Hastings sampler. +We hard-code the proposal step as part of the sampler and only allow normal distributions with zero mean and fixed standard deviation. +The Metropolis-Hastings sampler in Turing is more flexible. + +```{julia} +struct MHSampler{T<:Real} <: AbstractMCMC.AbstractSampler + sigma::T +end + +MHSampler() = MHSampler(1) + +function assume(context::SamplingContext{<:MHSampler}, varinfo, dist, var_id) + sampler = context.sampler + old_value = varinfo.values[var_id] + + # propose a random-walk step, i.e, add the current value to a random + # value sampled from a Normal distribution centered at 0 + value = rand(context.rng, Normal(old_value, sampler.sigma)) + logp = Distributions.logpdf(dist, value) + varinfo[var_id] = (value, logp) + + return value +end; +``` + +We need to define two `step` functions, one for the first step and the other for the following steps. +In the first step we sample values from the prior distributions and in the following steps we sample with the random-walk proposal. +The two functions are identified by the different arguments they take. 
+
+```{julia}
+# The first step: Sampling from the prior distributions
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG, model::MiniModel, sampler::MHSampler; kwargs...
+)
+    vi = VarInfo()
+    ctx = SamplingContext(rng, PriorSampler())
+    model.f(vi, ctx, values(model.data)...)
+    return vi, vi
+end
+
+# The following steps: Sampling with random-walk proposal
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG,
+    model::MiniModel,
+    sampler::MHSampler,
+    prev_state::VarInfo; # is just the old trace
+    kwargs...,
+)
+    vi = prev_state
+    new_vi = deepcopy(vi)
+    ctx = SamplingContext(rng, sampler)
+    model.f(new_vi, ctx, values(model.data)...)
+
+    # Compute log acceptance probability
+    # Since the proposal is symmetric the computation can be simplified
+    logα = sum(values(new_vi.logps)) - sum(values(vi.logps))
+
+    # Accept proposal with computed acceptance probability
+    if -randexp(rng) < logα
+        return new_vi, new_vi
+    else
+        return prev_state, prev_state
+    end
+end;
+```
+
+To make it easier to analyze the samples and compare them with results from Turing, we additionally define a version of `AbstractMCMC.bundle_samples` for our model and sampler that returns a `MCMCChains.Chains` object of samples.
+
+```{julia}
+function AbstractMCMC.bundle_samples(
+    samples, model::MiniModel, ::MHSampler, ::Any, ::Type{Chains}; kwargs...
+)
+    # We get a vector of traces
+    values = [sample.values for sample in samples]
+    params = [key for key in keys(values[1]) if key ∉ keys(model.data)]
+    vals = reduce(hcat, [value[p] for value in values] for p in params)
+    # Composing the `Chains` data-structure, of which analyzing infrastructure is provided
+    chains = Chains(vals, params)
+    return chains
+end;
+```
+
+Let us check how our mini probabilistic programming language works.
+We define the probabilistic model:
+
+```{julia}
+@mini_model function m(x)
+    a ~ Normal(0.5, 1)
+    b ~ Normal(a, 2)
+    x ~ Normal(b, 0.5)
+    return nothing
+end;
+```
+
+We perform inference with data `x = 3.0`:
+
+```{julia}
+sample(MiniModel(m, (x=3.0,)), MHSampler(), 1_000_000; chain_type=Chains, progress=false)
+```
+
+We compare these results with Turing.
+
+```{julia}
+using Turing
+using PDMats
+
+@model function turing_m(x)
+    a ~ Normal(0.5, 1)
+    b ~ Normal(a, 2)
+    x ~ Normal(b, 0.5)
+    return nothing
+end
+
+sample(turing_m(3.0), MH(ScalMat(2, 1.0)), 1_000_000, progress=false)
+```
+
+As you can see, with our simple probabilistic programming language and custom samplers we get results similar to Turing's.
diff --git a/tutorials/16-contexts/index.qmd b/developers/compiler/minituring-contexts/index.qmd
similarity index 97%
rename from tutorials/16-contexts/index.qmd
rename to developers/compiler/minituring-contexts/index.qmd
index 01430f241..6468483ef 100755
--- a/tutorials/16-contexts/index.qmd
+++ b/developers/compiler/minituring-contexts/index.qmd
@@ -1,304 +1,306 @@
----
-title: "A Mini Turing Implementation II: Contexts"
-engine: julia
----
-
-```{julia}
-#| echo: false
-#| output: false
-using Pkg;
-Pkg.instantiate();
-```
-
-In the [Mini Turing]({{< meta minituring >}}) tutorial we developed a miniature version of the Turing language, to illustrate its core design. A passing mention was made of contexts. In this tutorial we develop that aspect of our mini Turing language further to demonstrate how and why contexts are an important part of Turing's design.
-
-# Mini Turing expanded, now with more contexts
-
-If you haven't read [Mini Turing]({{< meta minituring >}}) yet, you should do that first.
We start by repeating verbatim much of the code from there. Define the type for holding values for variables: - -```{julia} -import MacroTools, Random, AbstractMCMC -using Distributions: Normal, logpdf -using MCMCChains: Chains -using AbstractMCMC: sample - -struct VarInfo{V,L} - values::V - logps::L -end - -VarInfo() = VarInfo(Dict{Symbol,Float64}(), Dict{Symbol,Float64}()) - -function Base.setindex!(varinfo::VarInfo, (value, logp), var_id) - varinfo.values[var_id] = value - varinfo.logps[var_id] = logp - return varinfo -end -``` - -Define the macro that expands `~` expressions to calls to `assume` and `observe`: - -```{julia} -# Methods will be defined for these later. -function assume end -function observe end - -macro mini_model(expr) - return esc(mini_model(expr)) -end - -function mini_model(expr) - # Split the function definition into a dictionary with its name, arguments, body etc. - def = MacroTools.splitdef(expr) - - # Replace tildes in the function body with calls to `assume` or `observe` - def[:body] = MacroTools.postwalk(def[:body]) do sub_expr - if MacroTools.@capture(sub_expr, var_ ~ dist_) - if var in def[:args] - # If the variable is an argument of the model function, it is observed - return :($(observe)(context, varinfo, $dist, $(Meta.quot(var)), $var)) - else - # Otherwise it is unobserved - return :($var = $(assume)(context, varinfo, $dist, $(Meta.quot(var)))) - end - else - return sub_expr - end - end - - # Add `context` and `varinfo` arguments to the model function - def[:args] = vcat(:varinfo, :context, def[:args]) - - # Reassemble the function definition from its name, arguments, body etc. - return MacroTools.combinedef(def) -end - - -struct MiniModel{F,D} <: AbstractMCMC.AbstractModel - f::F - data::D # a NamedTuple of all the data -end -``` - -Define an example model: - -```{julia} -@mini_model function m(x) - a ~ Normal(0.5, 1) - b ~ Normal(a, 2) - x ~ Normal(b, 0.5) - return nothing -end; - -mini_m = MiniModel(m, (x=3.0,)) -``` - -Previously in the mini Turing case, at this point we defined `SamplingContext`, a structure that holds a random number generator and a sampler, and gets passed to `observe` and `assume`. We then used it to implement a simple Metropolis-Hastings sampler. - -The notion of a context may have seemed overly complicated just to implement the sampler, but there are other things we may want to do with a model than sample from the posterior. Having the context passing in place lets us do that without having to touch the above macro at all. For instance, let's say we want to evaluate the log joint probability of the model for a given set of data and parameters. Using a new context type we can use the previously defined `model` function, but change its behavior by changing what the `observe` and `assume` functions do. - - - -```{julia} -struct JointContext end - -function observe(context::JointContext, varinfo, dist, var_id, var_value) - logp = logpdf(dist, var_value) - varinfo[var_id] = (var_value, logp) - return nothing -end - -function assume(context::JointContext, varinfo, dist, var_id) - if !haskey(varinfo.values, var_id) - error("Can't evaluate the log probability if the variable $(var_id) is not set.") - end - var_value = varinfo.values[var_id] - logp = logpdf(dist, var_value) - varinfo[var_id] = (var_value, logp) - return var_value -end - -function logjoint(model, parameter_values::NamedTuple) - vi = VarInfo() - for (var_id, value) in pairs(parameter_values) - # Set the log prob to NaN for now. 
These will get overwritten when model.f is - # called with JointContext. - vi[var_id] = (value, NaN) - end - model.f(vi, JointContext(), values(model.data)...) - return sum(values(vi.logps)) -end - -logjoint(mini_m, (a=0.5, b=1.0)) -``` - -When using the `JointContext` no sampling whatsoever happens in calling `mini_m`. Rather only the log probability of each given variable value is evaluated. `logjoint` then sums these results to get the total log joint probability. - -We can similarly define a context for evaluating the log prior probability: - -```{julia} -struct PriorContext end - -function observe(context::PriorContext, varinfo, dist, var_id, var_value) - # Since we are evaluating the prior, the log probability of all the observations - # is set to 0. This has the effect of ignoring the likelihood. - varinfo[var_id] = (var_value, 0.0) - return nothing -end - -function assume(context::PriorContext, varinfo, dist, var_id) - if !haskey(varinfo.values, var_id) - error("Can't evaluate the log probability if the variable $(var_id) is not set.") - end - var_value = varinfo.values[var_id] - logp = logpdf(dist, var_value) - varinfo[var_id] = (var_value, logp) - return var_value -end - -function logprior(model, parameter_values::NamedTuple) - vi = VarInfo() - for (var_id, value) in pairs(parameter_values) - vi[var_id] = (value, NaN) - end - model.f(vi, PriorContext(), values(model.data)...) - return sum(values(vi.logps)) -end - -logprior(mini_m, (a=0.5, b=1.0)) -``` - -Notice that the definition of `assume(context::PriorContext, args...)` is identical to the one for `JointContext`, and `logprior` and `logjoint` are also identical except for the context type they create. There's clearly an opportunity here for some refactoring using abstract types, but that's outside the scope of this tutorial. Rather, the point here is to demonstrate that we can extract different sorts of things from our model by defining different context types, and specialising `observe` and `assume` for them. - - -## Contexts within contexts - -Let's use the above two contexts to provide a slightly more general definition of the `SamplingContext` and the Metropolis-Hastings sampler we wrote in the mini Turing tutorial. - -```{julia} -struct SamplingContext{S<:AbstractMCMC.AbstractSampler,R<:Random.AbstractRNG} - rng::R - sampler::S - subcontext::Union{PriorContext, JointContext} -end -``` - -The new aspect here is the `subcontext` field. Note that this is a context within a context! The idea is that we don't need to hard code how the MCMC sampler evaluates the log probability, but rather can pass that work onto the subcontext. This way the same sampler can be used to sample from either the joint or the prior distribution. - -The methods for `SamplingContext` are largely as in the our earlier mini Turing case, except they now pass some of the work onto the subcontext: - -```{julia} -function observe(context::SamplingContext, args...) - # Sampling doesn't affect the observed values, so nothing to do here other than pass to - # the subcontext. - return observe(context.subcontext, args...) -end - -struct PriorSampler <: AbstractMCMC.AbstractSampler end - -function assume(context::SamplingContext{PriorSampler}, varinfo, dist, var_id) - sample = Random.rand(context.rng, dist) - varinfo[var_id] = (sample, NaN) - # Once the value has been sampled, let the subcontext handle evaluating the log - # probability. 
- return assume(context.subcontext, varinfo, dist, var_id) -end; - -# The subcontext field of the MHSampler determines which distribution this sampler -# samples from. -struct MHSampler{D, T<:Real} <: AbstractMCMC.AbstractSampler - sigma::T - subcontext::D -end - -MHSampler(subcontext) = MHSampler(1, subcontext) - -function assume(context::SamplingContext{<:MHSampler}, varinfo, dist, var_id) - sampler = context.sampler - old_value = varinfo.values[var_id] - - # propose a random-walk step, i.e, add the current value to a random - # value sampled from a Normal distribution centered at 0 - value = rand(context.rng, Normal(old_value, sampler.sigma)) - varinfo[var_id] = (value, NaN) - # Once the value has been sampled, let the subcontext handle evaluating the log - # probability. - return assume(context.subcontext, varinfo, dist, var_id) -end; - -# The following three methods are identical to before, except for passing -# `sampler.subcontext` to the context SamplingContext. -function AbstractMCMC.step( - rng::Random.AbstractRNG, model::MiniModel, sampler::MHSampler; kwargs... -) - vi = VarInfo() - ctx = SamplingContext(rng, PriorSampler(), sampler.subcontext) - model.f(vi, ctx, values(model.data)...) - return vi, vi -end - -function AbstractMCMC.step( - rng::Random.AbstractRNG, - model::MiniModel, - sampler::MHSampler, - prev_state::VarInfo; # is just the old trace - kwargs..., -) - vi = prev_state - new_vi = deepcopy(vi) - ctx = SamplingContext(rng, sampler, sampler.subcontext) - model.f(new_vi, ctx, values(model.data)...) - - # Compute log acceptance probability - # Since the proposal is symmetric the computation can be simplified - logα = sum(values(new_vi.logps)) - sum(values(vi.logps)) - - # Accept proposal with computed acceptance probability - if -Random.randexp(rng) < logα - return new_vi, new_vi - else - return prev_state, prev_state - end -end; - -function AbstractMCMC.bundle_samples( - samples, model::MiniModel, ::MHSampler, ::Any, ::Type{Chains}; kwargs... -) - # We get a vector of traces - values = [sample.values for sample in samples] - params = [key for key in keys(values[1]) if key ∉ keys(model.data)] - vals = reduce(hcat, [value[p] for value in values] for p in params) - # Composing the `Chains` data-structure, of which analyzing infrastructure is provided - chains = Chains(vals, params) - return chains -end; -``` - -We can use this to sample from the joint distribution just like before: - -```{julia} -sample(MiniModel(m, (x=3.0,)), MHSampler(JointContext()), 1_000_000; chain_type=Chains, progress=false) -``` - -or we can choose to sample from the prior instead - -```{julia} -sample(MiniModel(m, (x=3.0,)), MHSampler(PriorContext()), 1_000_000; chain_type=Chains, progress=false) -``` - -Of course, using an MCMC algorithm to sample from the prior is unnecessary and silly (`PriorSampler` exists, after all), but the point is to illustrate the flexibility of the context system. We could, for instance, use the same setup to implement an _Approximate Bayesian Computation_ (ABC) algorithm. - - -The use of contexts also goes far beyond just evaluating log probabilities and sampling. Some examples from Turing are - -* `FixedContext`, which fixes some variables to given values and removes them completely from the evaluation of any log probabilities. They power the `Turing.fix` and `Turing.unfix` functions. -* `ConditionContext` conditions the model on fixed values for some parameters. They are used by `Turing.condition` and `Turing.uncondition`, i.e. 
the `model | (parameter=value,)` syntax. The difference between `fix` and `condition` is whether the log probability for the corresponding variable is included in the overall log density. - -* `PriorExtractorContext` collects information about what the prior distribution of each variable is. -* `PrefixContext` adds prefixes to variable names, allowing models to be used within other models without variable name collisions. -* `PointwiseLikelihoodContext` records the log likelihood of each individual variable. -* `DebugContext` collects useful debugging information while executing the model. - -All of the above are what Turing calls _parent contexts_, which is to say that they all keep a subcontext just like our above `SamplingContext` did. Their implementations of `assume` and `observe` call the implementation of the subcontext once they are done doing their own work of fixing/conditioning/prefixing/etc. Contexts are often chained, so that e.g. a `DebugContext` may wrap within it a `PrefixContext`, which may in turn wrap a `ConditionContext`, etc. The only contexts that _don't_ have a subcontext in the Turing are the ones for evaluating the prior, likelihood, and joint distributions. These are called _leaf contexts_. - -The above version of mini Turing is still much simpler than the full Turing language, but the principles of how contexts are used are the same. +--- +title: "A Mini Turing Implementation II: Contexts" +engine: julia +aliases: + - ../../../tutorials/16-contexts/index.html +--- + +```{julia} +#| echo: false +#| output: false +using Pkg; +Pkg.instantiate(); +``` + +In the [Mini Turing]({{< meta minituring >}}) tutorial we developed a miniature version of the Turing language, to illustrate its core design. A passing mention was made of contexts. In this tutorial we develop that aspect of our mini Turing language further to demonstrate how and why contexts are an important part of Turing's design. + +# Mini Turing expanded, now with more contexts + +If you haven't read [Mini Turing]({{< meta minituring >}}) yet, you should do that first. We start by repeating verbatim much of the code from there. Define the type for holding values for variables: + +```{julia} +import MacroTools, Random, AbstractMCMC +using Distributions: Normal, logpdf +using MCMCChains: Chains +using AbstractMCMC: sample + +struct VarInfo{V,L} + values::V + logps::L +end + +VarInfo() = VarInfo(Dict{Symbol,Float64}(), Dict{Symbol,Float64}()) + +function Base.setindex!(varinfo::VarInfo, (value, logp), var_id) + varinfo.values[var_id] = value + varinfo.logps[var_id] = logp + return varinfo +end +``` + +Define the macro that expands `~` expressions to calls to `assume` and `observe`: + +```{julia} +# Methods will be defined for these later. +function assume end +function observe end + +macro mini_model(expr) + return esc(mini_model(expr)) +end + +function mini_model(expr) + # Split the function definition into a dictionary with its name, arguments, body etc. 
+ def = MacroTools.splitdef(expr) + + # Replace tildes in the function body with calls to `assume` or `observe` + def[:body] = MacroTools.postwalk(def[:body]) do sub_expr + if MacroTools.@capture(sub_expr, var_ ~ dist_) + if var in def[:args] + # If the variable is an argument of the model function, it is observed + return :($(observe)(context, varinfo, $dist, $(Meta.quot(var)), $var)) + else + # Otherwise it is unobserved + return :($var = $(assume)(context, varinfo, $dist, $(Meta.quot(var)))) + end + else + return sub_expr + end + end + + # Add `context` and `varinfo` arguments to the model function + def[:args] = vcat(:varinfo, :context, def[:args]) + + # Reassemble the function definition from its name, arguments, body etc. + return MacroTools.combinedef(def) +end + + +struct MiniModel{F,D} <: AbstractMCMC.AbstractModel + f::F + data::D # a NamedTuple of all the data +end +``` + +Define an example model: + +```{julia} +@mini_model function m(x) + a ~ Normal(0.5, 1) + b ~ Normal(a, 2) + x ~ Normal(b, 0.5) + return nothing +end; + +mini_m = MiniModel(m, (x=3.0,)) +``` + +Previously in the mini Turing case, at this point we defined `SamplingContext`, a structure that holds a random number generator and a sampler, and gets passed to `observe` and `assume`. We then used it to implement a simple Metropolis-Hastings sampler. + +The notion of a context may have seemed overly complicated just to implement the sampler, but there are other things we may want to do with a model than sample from the posterior. Having the context passing in place lets us do that without having to touch the above macro at all. For instance, let's say we want to evaluate the log joint probability of the model for a given set of data and parameters. Using a new context type we can use the previously defined `model` function, but change its behavior by changing what the `observe` and `assume` functions do. + + + +```{julia} +struct JointContext end + +function observe(context::JointContext, varinfo, dist, var_id, var_value) + logp = logpdf(dist, var_value) + varinfo[var_id] = (var_value, logp) + return nothing +end + +function assume(context::JointContext, varinfo, dist, var_id) + if !haskey(varinfo.values, var_id) + error("Can't evaluate the log probability if the variable $(var_id) is not set.") + end + var_value = varinfo.values[var_id] + logp = logpdf(dist, var_value) + varinfo[var_id] = (var_value, logp) + return var_value +end + +function logjoint(model, parameter_values::NamedTuple) + vi = VarInfo() + for (var_id, value) in pairs(parameter_values) + # Set the log prob to NaN for now. These will get overwritten when model.f is + # called with JointContext. + vi[var_id] = (value, NaN) + end + model.f(vi, JointContext(), values(model.data)...) + return sum(values(vi.logps)) +end + +logjoint(mini_m, (a=0.5, b=1.0)) +``` + +When using the `JointContext` no sampling whatsoever happens in calling `mini_m`. Rather only the log probability of each given variable value is evaluated. `logjoint` then sums these results to get the total log joint probability. + +We can similarly define a context for evaluating the log prior probability: + +```{julia} +struct PriorContext end + +function observe(context::PriorContext, varinfo, dist, var_id, var_value) + # Since we are evaluating the prior, the log probability of all the observations + # is set to 0. This has the effect of ignoring the likelihood. 
+    varinfo[var_id] = (var_value, 0.0)
+    return nothing
+end
+
+function assume(context::PriorContext, varinfo, dist, var_id)
+    if !haskey(varinfo.values, var_id)
+        error("Can't evaluate the log probability if the variable $(var_id) is not set.")
+    end
+    var_value = varinfo.values[var_id]
+    logp = logpdf(dist, var_value)
+    varinfo[var_id] = (var_value, logp)
+    return var_value
+end
+
+function logprior(model, parameter_values::NamedTuple)
+    vi = VarInfo()
+    for (var_id, value) in pairs(parameter_values)
+        vi[var_id] = (value, NaN)
+    end
+    model.f(vi, PriorContext(), values(model.data)...)
+    return sum(values(vi.logps))
+end
+
+logprior(mini_m, (a=0.5, b=1.0))
+```
+
+Notice that the definition of `assume(context::PriorContext, args...)` is identical to the one for `JointContext`, and `logprior` and `logjoint` are also identical except for the context type they create. There's clearly an opportunity here for some refactoring using abstract types, but that's outside the scope of this tutorial. Rather, the point here is to demonstrate that we can extract different sorts of things from our model by defining different context types, and specialising `observe` and `assume` for them.
+
+
+## Contexts within contexts
+
+Let's use the above two contexts to provide a slightly more general definition of the `SamplingContext` and the Metropolis-Hastings sampler we wrote in the mini Turing tutorial.
+
+```{julia}
+struct SamplingContext{S<:AbstractMCMC.AbstractSampler,R<:Random.AbstractRNG}
+    rng::R
+    sampler::S
+    subcontext::Union{PriorContext, JointContext}
+end
+```
+
+The new aspect here is the `subcontext` field. Note that this is a context within a context! The idea is that we don't need to hard code how the MCMC sampler evaluates the log probability, but rather can pass that work onto the subcontext. This way the same sampler can be used to sample from either the joint or the prior distribution.
+
+The methods for `SamplingContext` are largely as in our earlier mini Turing case, except they now pass some of the work onto the subcontext:
+
+```{julia}
+function observe(context::SamplingContext, args...)
+    # Sampling doesn't affect the observed values, so nothing to do here other than pass to
+    # the subcontext.
+    return observe(context.subcontext, args...)
+end
+
+struct PriorSampler <: AbstractMCMC.AbstractSampler end
+
+function assume(context::SamplingContext{PriorSampler}, varinfo, dist, var_id)
+    sample = Random.rand(context.rng, dist)
+    varinfo[var_id] = (sample, NaN)
+    # Once the value has been sampled, let the subcontext handle evaluating the log
+    # probability.
+    return assume(context.subcontext, varinfo, dist, var_id)
+end;
+
+# The subcontext field of the MHSampler determines which distribution this sampler
+# samples from.
+struct MHSampler{D, T<:Real} <: AbstractMCMC.AbstractSampler
+    sigma::T
+    subcontext::D
+end
+
+MHSampler(subcontext) = MHSampler(1, subcontext)
+
+function assume(context::SamplingContext{<:MHSampler}, varinfo, dist, var_id)
+    sampler = context.sampler
+    old_value = varinfo.values[var_id]
+
+    # propose a random-walk step, i.e., add a random value sampled from a
+    # Normal distribution centered at 0 to the current value
+    value = rand(context.rng, Normal(old_value, sampler.sigma))
+    varinfo[var_id] = (value, NaN)
+    # Once the value has been sampled, let the subcontext handle evaluating the log
+    # probability.
+ return assume(context.subcontext, varinfo, dist, var_id) +end; + +# The following three methods are identical to before, except for passing +# `sampler.subcontext` to the context SamplingContext. +function AbstractMCMC.step( + rng::Random.AbstractRNG, model::MiniModel, sampler::MHSampler; kwargs... +) + vi = VarInfo() + ctx = SamplingContext(rng, PriorSampler(), sampler.subcontext) + model.f(vi, ctx, values(model.data)...) + return vi, vi +end + +function AbstractMCMC.step( + rng::Random.AbstractRNG, + model::MiniModel, + sampler::MHSampler, + prev_state::VarInfo; # is just the old trace + kwargs..., +) + vi = prev_state + new_vi = deepcopy(vi) + ctx = SamplingContext(rng, sampler, sampler.subcontext) + model.f(new_vi, ctx, values(model.data)...) + + # Compute log acceptance probability + # Since the proposal is symmetric the computation can be simplified + logα = sum(values(new_vi.logps)) - sum(values(vi.logps)) + + # Accept proposal with computed acceptance probability + if -Random.randexp(rng) < logα + return new_vi, new_vi + else + return prev_state, prev_state + end +end; + +function AbstractMCMC.bundle_samples( + samples, model::MiniModel, ::MHSampler, ::Any, ::Type{Chains}; kwargs... +) + # We get a vector of traces + values = [sample.values for sample in samples] + params = [key for key in keys(values[1]) if key ∉ keys(model.data)] + vals = reduce(hcat, [value[p] for value in values] for p in params) + # Composing the `Chains` data-structure, of which analyzing infrastructure is provided + chains = Chains(vals, params) + return chains +end; +``` + +We can use this to sample from the joint distribution just like before: + +```{julia} +sample(MiniModel(m, (x=3.0,)), MHSampler(JointContext()), 1_000_000; chain_type=Chains, progress=false) +``` + +or we can choose to sample from the prior instead + +```{julia} +sample(MiniModel(m, (x=3.0,)), MHSampler(PriorContext()), 1_000_000; chain_type=Chains, progress=false) +``` + +Of course, using an MCMC algorithm to sample from the prior is unnecessary and silly (`PriorSampler` exists, after all), but the point is to illustrate the flexibility of the context system. We could, for instance, use the same setup to implement an _Approximate Bayesian Computation_ (ABC) algorithm. + + +The use of contexts also goes far beyond just evaluating log probabilities and sampling. Some examples from Turing are + +* `FixedContext`, which fixes some variables to given values and removes them completely from the evaluation of any log probabilities. They power the `Turing.fix` and `Turing.unfix` functions. +* `ConditionContext` conditions the model on fixed values for some parameters. They are used by `Turing.condition` and `Turing.uncondition`, i.e. the `model | (parameter=value,)` syntax. The difference between `fix` and `condition` is whether the log probability for the corresponding variable is included in the overall log density. + +* `PriorExtractorContext` collects information about what the prior distribution of each variable is. +* `PrefixContext` adds prefixes to variable names, allowing models to be used within other models without variable name collisions. +* `PointwiseLikelihoodContext` records the log likelihood of each individual variable. +* `DebugContext` collects useful debugging information while executing the model. + +All of the above are what Turing calls _parent contexts_, which is to say that they all keep a subcontext just like our above `SamplingContext` did. 
Their implementations of `assume` and `observe` call the implementation of the subcontext once they are done doing their own work of fixing/conditioning/prefixing/etc. Contexts are often chained, so that e.g. a `DebugContext` may wrap within it a `PrefixContext`, which may in turn wrap a `ConditionContext`, etc. The only contexts that _don't_ have a subcontext in Turing are the ones for evaluating the prior, likelihood, and joint distributions. These are called _leaf contexts_.
+
+The above version of mini Turing is still much simpler than the full Turing language, but the principles of how contexts are used are the same.
diff --git a/tutorials/dev-model-manual/index.qmd b/developers/compiler/model-manual/index.qmd
similarity index 96%
rename from tutorials/dev-model-manual/index.qmd
rename to developers/compiler/model-manual/index.qmd
index bc4205695..24cb77365 100755
--- a/tutorials/dev-model-manual/index.qmd
+++ b/developers/compiler/model-manual/index.qmd
@@ -1,6 +1,8 @@
 ---
 title: Manually Defining a Model
 engine: julia
+aliases:
+  - ../../../tutorials/dev-model-manual/index.html
 ---
 
 Traditionally, models in Turing are defined using the `@model` macro:
diff --git a/tutorials/docs-01-contributing-guide/index.qmd b/developers/contributing/index.qmd
similarity index 98%
rename from tutorials/docs-01-contributing-guide/index.qmd
rename to developers/contributing/index.qmd
index 78d240708..00040e7e2 100755
--- a/tutorials/docs-01-contributing-guide/index.qmd
+++ b/developers/contributing/index.qmd
@@ -1,76 +1,78 @@
----
-title: Contributing
----
-
-Turing is an open-source project and is [hosted on GitHub](https://github.com/TuringLang).
-We welcome contributions from the community in all forms large or small: bug reports, feature implementations, code contributions, or improvements to documentation or infrastructure are all extremely valuable.
-We would also very much appreciate examples of models written using Turing.
-
-### How to get involved
-
-Our outstanding issues are tabulated on our [issue tracker](https://github.com/TuringLang/Turing.jl/issues).
-Closing one of these may involve implementing new features, fixing bugs, or writing example models.
-
-You can also join the `#turing` channel on the [Julia Slack](https://julialang.org/slack/) and say hello!
-
-If you are new to open-source software, please see [GitHub's introduction](https://guides.github.com/introduction/flow/) or [Julia's contribution guide](https://github.com/JuliaLang/julia/blob/master/CONTRIBUTING.md) on using version control for collaboration.
-
-### Documentation
-
-Each of the packages in the Turing ecosystem (see [Libraries](/library)) has its own documentation, which is typically found in the `docs` folder of the corresponding package.
-For example, the source code for DynamicPPL's documentation can be found in [its repository](https://github.com/TuringLang/DynamicPPL.jl).
-
-The documentation for Turing.jl itself consists of the tutorials that you see on this website, and is built from the separate [`docs` repository](https://github.com/TuringLang/docs).
-None of the documentation is generated from the [main Turing.jl repository](https://github.com/TuringLang/Turing.jl); in particular, the API that Turing exports does not currently form part of the documentation.
-
-Other sections of the website (anything that isn't a package, or a tutorial) – for example, the list of libraries – is built from the [`turinglang.github.io` repository](https://github.com/TuringLang/turinglang.github.io).
- -### Tests - -Turing, like most software libraries, has a test suite. You can run the whole suite by running `julia --project=.` from the root of the Turing repository, and then running - -```julia -import Pkg; Pkg.test("Turing") -``` - -The test suite subdivides into files in the `test` folder, and you can run only some of them using commands like - -```julia -import Pkg; Pkg.test("Turing"; test_args=["optim", "hmc", "--skip", "ext"]) -``` - -This one would run all files with "optim" or "hmc" in their path, such as `test/optimisation/Optimisation.jl`, but not files with "ext" in their path. Alternatively, you can set these arguments as command line arguments when you run Julia - -```julia -julia --project=. -e 'import Pkg; Pkg.test(; test_args=ARGS)' -- optim hmc --skip ext -``` - -Or otherwise, set the global `ARGS` variable, and call `include("test/runtests.jl")`. - -### Style Guide - -Turing has a style guide, described below. -Reviewing it before making a pull request is not strictly necessary, but you may be asked to change portions of your code to conform with the style guide before it is merged. - -Most Turing code follows [Blue: a Style Guide for Julia](https://github.com/JuliaDiff/BlueStyle). -These conventions were created from a variety of sources including Python's [PEP8](http://legacy.python.org/dev/peps/pep-0008/), Julia's [Notes for Contributors](https://github.com/JuliaLang/julia/blob/master/CONTRIBUTING.md), and Julia's [Style Guide](https://docs.julialang.org/en/v1/manual/style-guide/). - -#### Synopsis - - - Use 4 spaces per indentation level, no tabs. - - Try to adhere to a 92 character line length limit. - - Use upper camel case convention for [modules](https://docs.julialang.org/en/v1/manual/modules/) and [types](https://docs.julialang.org/en/v1/manual/types/). - - Use lower case with underscores for method names (note: Julia code likes to use lower case without underscores). - - Comments are good, try to explain the intentions of the code. - - Use whitespace to make the code more readable. - - No whitespace at the end of a line (trailing whitespace). - - Avoid padding brackets with spaces. ex. `Int64(value)` preferred over `Int64( value )`. - -#### A Word on Consistency - -When adhering to the Blue style, it's important to realize that these are guidelines, not rules. This is [stated best in the PEP8](http://legacy.python.org/dev/peps/pep-0008/#a-foolish-consistency-is-the-hobgoblin-of-little-minds): - -> A style guide is about consistency. Consistency with this style guide is important. Consistency within a project is more important. Consistency within one module or function is most important. - -> But most importantly: know when to be inconsistent – sometimes the style guide just doesn't apply. When in doubt, use your best judgment. Look at other examples and decide what looks best. And don't hesitate to ask! - +--- +title: Contributing +aliases: + - ../../tutorials/docs-01-contributing-guide/index.html +--- + +Turing is an open-source project and is [hosted on GitHub](https://github.com/TuringLang). +We welcome contributions from the community in all forms large or small: bug reports, feature implementations, code contributions, or improvements to documentation or infrastructure are all extremely valuable. +We would also very much appreciate examples of models written using Turing. + +### How to get involved + +Our outstanding issues are tabulated on our [issue tracker](https://github.com/TuringLang/Turing.jl/issues). 
+Closing one of these may involve implementing new features, fixing bugs, or writing example models. + +You can also join the `#turing` channel on the [Julia Slack](https://julialang.org/slack/) and say hello! + +If you are new to open-source software, please see [GitHub's introduction](https://guides.github.com/introduction/flow/) or [Julia's contribution guide](https://github.com/JuliaLang/julia/blob/master/CONTRIBUTING.md) on using version control for collaboration. + +### Documentation + +Each of the packages in the Turing ecosystem (see [Libraries](/library)) has its own documentation, which is typically found in the `docs` folder of the corresponding package. +For example, the source code for DynamicPPL's documentation can be found in [its repository](https://github.com/TuringLang/DynamicPPL.jl). + +The documentation for Turing.jl itself consists of the tutorials that you see on this website, and is built from the separate [`docs` repository](https://github.com/TuringLang/docs). +None of the documentation is generated from the [main Turing.jl repository](https://github.com/TuringLang/Turing.jl); in particular, the API that Turing exports does not currently form part of the documentation. + +Other sections of the website (anything that isn't a package, or a tutorial) – for example, the list of libraries – is built from the [`turinglang.github.io` repository](https://github.com/TuringLang/turinglang.github.io). + +### Tests + +Turing, like most software libraries, has a test suite. You can run the whole suite by running `julia --project=.` from the root of the Turing repository, and then running + +```julia +import Pkg; Pkg.test("Turing") +``` + +The test suite subdivides into files in the `test` folder, and you can run only some of them using commands like + +```julia +import Pkg; Pkg.test("Turing"; test_args=["optim", "hmc", "--skip", "ext"]) +``` + +This one would run all files with "optim" or "hmc" in their path, such as `test/optimisation/Optimisation.jl`, but not files with "ext" in their path. Alternatively, you can set these arguments as command line arguments when you run Julia + +```julia +julia --project=. -e 'import Pkg; Pkg.test(; test_args=ARGS)' -- optim hmc --skip ext +``` + +Or otherwise, set the global `ARGS` variable, and call `include("test/runtests.jl")`. + +### Style Guide + +Turing has a style guide, described below. +Reviewing it before making a pull request is not strictly necessary, but you may be asked to change portions of your code to conform with the style guide before it is merged. + +Most Turing code follows [Blue: a Style Guide for Julia](https://github.com/JuliaDiff/BlueStyle). +These conventions were created from a variety of sources including Python's [PEP8](http://legacy.python.org/dev/peps/pep-0008/), Julia's [Notes for Contributors](https://github.com/JuliaLang/julia/blob/master/CONTRIBUTING.md), and Julia's [Style Guide](https://docs.julialang.org/en/v1/manual/style-guide/). + +#### Synopsis + + - Use 4 spaces per indentation level, no tabs. + - Try to adhere to a 92 character line length limit. + - Use upper camel case convention for [modules](https://docs.julialang.org/en/v1/manual/modules/) and [types](https://docs.julialang.org/en/v1/manual/types/). + - Use lower case with underscores for method names (note: Julia code likes to use lower case without underscores). + - Comments are good, try to explain the intentions of the code. + - Use whitespace to make the code more readable. + - No whitespace at the end of a line (trailing whitespace). 
+ - Avoid padding brackets with spaces. ex. `Int64(value)` preferred over `Int64( value )`. + +#### A Word on Consistency + +When adhering to the Blue style, it's important to realize that these are guidelines, not rules. This is [stated best in the PEP8](http://legacy.python.org/dev/peps/pep-0008/#a-foolish-consistency-is-the-hobgoblin-of-little-minds): + +> A style guide is about consistency. Consistency with this style guide is important. Consistency within a project is more important. Consistency within one module or function is most important. + +> But most importantly: know when to be inconsistent – sometimes the style guide just doesn't apply. When in doubt, use your best judgment. Look at other examples and decide what looks best. And don't hesitate to ask! + diff --git a/tutorials/docs-06-for-developers-interface/index.qmd b/developers/inference/abstractmcmc-interface/index.qmd similarity index 97% rename from tutorials/docs-06-for-developers-interface/index.qmd rename to developers/inference/abstractmcmc-interface/index.qmd index 1aba48670..993936209 100755 --- a/tutorials/docs-06-for-developers-interface/index.qmd +++ b/developers/inference/abstractmcmc-interface/index.qmd @@ -1,321 +1,323 @@ ---- -title: Interface Guide -engine: julia ---- - -```{julia} -#| echo: false -#| output: false -using Pkg; -Pkg.instantiate(); -``` - -# The sampling interface - -Turing implements a sampling interface (hosted at [AbstractMCMC](https://github.com/TuringLang/AbstractMCMC.jl)) that is intended to provide a common framework for Markov chain Monte Carlo samplers. The interface presents several structures and functions that one needs to overload in order to implement an interface-compatible sampler. - -This guide will demonstrate how to implement the interface without Turing. - -## Interface overview - -Any implementation of an inference method that uses the AbstractMCMC interface should implement a subset of the following types and functions: - -1. A subtype of `AbstractSampler`, defined as a mutable struct containing state information or sampler parameters. -2. A function `sample_init!` which performs any necessary set-up (default: do not perform any set-up). -3. A function `step!` which returns a transition that represents a single draw from the sampler. -4. A function `transitions_init` which returns a container for the transitions obtained from the sampler (default: return a `Vector{T}` of length `N` where `T` is the type of the transition obtained in the first step and `N` is the number of requested samples). -5. A function `transitions_save!` which saves transitions to the container (default: save the transition of iteration `i` at position `i` in the vector of transitions). -6. A function `sample_end!` which handles any sampler wrap-up (default: do not perform any wrap-up). -7. A function `bundle_samples` which accepts the container of transitions and returns a collection of samples (default: return the vector of transitions). - -The interface methods with exclamation points are those that are intended to allow for state mutation. Any mutating function is meant to allow mutation where needed -- you might use: - -- `sample_init!` to run some kind of sampler preparation, before sampling begins. This could mutate a sampler's state. -- `step!` might mutate a sampler flag after each sample. -- `sample_end!` contains any wrap-up you might need to do. If you were sampling in a transformed space, this might be where you convert everything back to a constrained space. 
- -## Why do you have an interface? - -The motivation for the interface is to allow Julia's fantastic probabilistic programming language community to have a set of standards and common implementations so we can all thrive together. Markov chain Monte Carlo methods tend to have a very similar framework to one another, and so a common interface should help more great inference methods built in single-purpose packages to experience more use among the community. - -## Implementing Metropolis-Hastings without Turing - -[Metropolis-Hastings](https://en.wikipedia.org/wiki/Markov_chain_Monte_Carlo) is often the first sampling method that people are exposed to. It is a very straightforward algorithm and is accordingly the easiest to implement, so it makes for a good example. In this section, you will learn how to use the types and functions listed above to implement the Metropolis-Hastings sampler using the MCMC interface. - -The full code for this implementation is housed in [AdvancedMH.jl](https://github.com/TuringLang/AdvancedMH.jl). - -### Imports - -Let's begin by importing the relevant libraries. We'll import `AbstractMCMC`, which contains the interface framework we'll fill out. We also need `Distributions` and `Random`. - -```{julia} -# Import the relevant libraries. -using AbstractMCMC: AbstractMCMC -using Distributions -using Random -``` - -An interface extension (like the one we're writing right now) typically requires that you overload or implement several functions. Specifically, you should `import` the functions you intend to overload. This next code block accomplishes that. - -From `Distributions`, we need `Sampleable`, `VariateForm`, and `ValueSupport`, three abstract types that define a distribution. Models in the interface are assumed to be subtypes of `Sampleable{VariateForm, ValueSupport}`. In this section our model is going be be extremely simple, so we will not end up using these except to make sure that the inference functions are dispatching correctly. - -### Sampler - -Let's begin our sampler definition by defining a sampler called `MetropolisHastings` which is a subtype of `AbstractSampler`. Correct typing is very important for proper interface implementation -- if you are missing a subtype, your method may not be dispatched to when you call `sample`. - -```{julia} -# Define a sampler type. -struct MetropolisHastings{T,D} <: AbstractMCMC.AbstractSampler - init_θ::T - proposal::D -end - -# Default constructors. -MetropolisHastings(init_θ::Real) = MetropolisHastings(init_θ, Normal(0, 1)) -function MetropolisHastings(init_θ::Vector{<:Real}) - return MetropolisHastings(init_θ, MvNormal(zero(init_θ), I)) -end -``` - -Above, we have defined a sampler that stores the initial parameterization of the prior, and a distribution object from which proposals are drawn. You can have a struct that has no fields, and simply use it for dispatching onto the relevant functions, or you can store a large amount of state information in your sampler. - -The general intuition for what to store in your sampler struct is that anything you may need to perform inference between samples but you don't want to store in a transition should go into the sampler struct. It's the only way you can carry non-sample related state information between `step!` calls. - -### Model - -Next, we need to have a model of some kind. A model is a struct that's a subtype of `AbstractModel` that contains whatever information is necessary to perform inference on your problem. 
In our case we want to know the mean and variance parameters of a Normal distribution, so we can reduce our model to the log density of a Normal.
-
-Note that we only have to do this because we are not yet integrating the sampler with Turing -- Turing has a very sophisticated modelling engine that removes the need to define custom model structs.
-
-```{julia}
-# Define a model type. Stores the log density function.
-struct DensityModel{F<:Function} <: AbstractMCMC.AbstractModel
-    ℓπ::F
-end
-```
-
-### Transition
-
-The next step is to define some transition which we will return from each `step!` call. We'll keep it simple by just defining a wrapper struct that contains the parameter draws and the log density of that draw:
-
-```{julia}
-# Create a very basic Transition type, only stores the
-# parameter draws and the log probability of the draw.
-struct Transition{T,L}
-    θ::T
-    lp::L
-end
-
-# Store the new draw and its log density.
-Transition(model::DensityModel, θ) = Transition(θ, ℓπ(model, θ))
-```
-
-`Transition` can now store any type of parameter, whether it's a vector of draws from multiple parameters or a single univariate draw.
-
-### Metropolis-Hastings
-
-Now it's time to get into the actual inference. We've defined all of the core pieces we need, but we still need to implement the `step!` function which actually performs inference.
-
-As a refresher, Metropolis-Hastings implements a very basic algorithm:
-
-1. Pick some initial state, ``\theta_0``.
-
-2. For ``t`` in ``[1,N],`` do
-
-    + Generate a proposal parameterization ``\theta^\prime_t \sim q(\theta^\prime_t \mid \theta_{t-1}).``
-
-    + Calculate the acceptance probability, ``\alpha = \min\left[1, \frac{\pi(\theta^\prime_t)}{\pi(\theta_{t-1})} \frac{q(\theta_{t-1} \mid \theta^\prime_t)}{q(\theta^\prime_t \mid \theta_{t-1})} \right].``
-
-    + If ``U \le \alpha`` where ``U \sim \operatorname{Uniform}(0, 1),`` then ``\theta_t = \theta^\prime_t.`` Otherwise, ``\theta_t = \theta_{t-1}.``
-
-Of course, it's much easier to do this in the log space, so the acceptance probability is more commonly written as
-
-```{.cell-bg}
-\log \alpha = \min\left[0, \log \pi(\theta^\prime_t) - \log \pi(\theta_{t-1}) + \log q(\theta_{t-1} \mid \theta^\prime_t) - \log q(\theta^\prime_t \mid \theta_{t-1}) \right].
-```
-
-In interface terms, we should do the following:
-
-1. Make a new transition containing a proposed sample.
-2. Calculate the acceptance probability.
-3. If we accept, return the new transition; otherwise, return the old one.
-
-### Steps
-
-The `step!` function is the function that performs the bulk of your inference. In our case, we will implement two `step!` functions -- one for the very first iteration, and one for every subsequent iteration.
-
-```{julia}
-#| eval: false
-# Define the first step! function, which is called at the
-# beginning of sampling. Return the initial parameter used
-# to define the sampler.
-function AbstractMCMC.step!(
-    rng::AbstractRNG,
-    model::DensityModel,
-    spl::MetropolisHastings,
-    N::Integer,
-    ::Nothing;
-    kwargs...,
-)
-    return Transition(model, spl.init_θ)
-end
-```
-
-The first `step!` function just packages up the initial parameterization inside the sampler, and returns it. We implicitly accept the very first parameterization.
-
-The other `step!` function performs the usual steps from Metropolis-Hastings. Included are several helper functions, `proposal` and `q`, which are designed to replicate the functions in the pseudocode above.
- -- `proposal` generates a new proposal in the form of a `Transition`, which can be univariate if the value passed in is univariate, or it can be multivariate if the `Transition` given is multivariate. Proposals use a basic `Normal` or `MvNormal` proposal distribution. -- `q` returns the log density of one parameterization conditional on another, according to the proposal distribution. -- `step!` generates a new proposal, checks the acceptance probability, and then returns either the previous transition or the proposed transition. - - -```{julia} -#| eval: false -# Define a function that makes a basic proposal depending on a univariate -# parameterization or a multivariate parameterization. -function propose(spl::MetropolisHastings, model::DensityModel, θ::Real) - return Transition(model, θ + rand(spl.proposal)) -end -function propose(spl::MetropolisHastings, model::DensityModel, θ::Vector{<:Real}) - return Transition(model, θ + rand(spl.proposal)) -end -function propose(spl::MetropolisHastings, model::DensityModel, t::Transition) - return propose(spl, model, t.θ) -end - -# Calculates the probability `q(θ|θcond)`, using the proposal distribution `spl.proposal`. -q(spl::MetropolisHastings, θ::Real, θcond::Real) = logpdf(spl.proposal, θ - θcond) -function q(spl::MetropolisHastings, θ::Vector{<:Real}, θcond::Vector{<:Real}) - return logpdf(spl.proposal, θ - θcond) -end -q(spl::MetropolisHastings, t1::Transition, t2::Transition) = q(spl, t1.θ, t2.θ) - -# Calculate the density of the model given some parameterization. -ℓπ(model::DensityModel, θ) = model.ℓπ(θ) -ℓπ(model::DensityModel, t::Transition) = t.lp - -# Define the other step function. Returns a Transition containing -# either a new proposal (if accepted) or the previous proposal -# (if not accepted). -function AbstractMCMC.step!( - rng::AbstractRNG, - model::DensityModel, - spl::MetropolisHastings, - ::Integer, - θ_prev::Transition; - kwargs..., -) - # Generate a new proposal. - θ = propose(spl, model, θ_prev) - - # Calculate the log acceptance probability. - α = ℓπ(model, θ) - ℓπ(model, θ_prev) + q(spl, θ_prev, θ) - q(spl, θ, θ_prev) - - # Decide whether to return the previous θ or the new one. - if log(rand(rng)) < min(α, 0.0) - return θ - else - return θ_prev - end -end -``` - -### Chains - -In the default implementation, `sample` just returns a vector of all transitions. If instead you would like to obtain a `Chains` object (e.g., to simplify downstream analysis), you have to implement the `bundle_samples` function as well. It accepts the vector of transitions and returns a collection of samples. Fortunately, our `Transition` is incredibly simple, and we only need to build a little bit of functionality to accept custom parameter names passed in by the user. - -```{julia} -#| eval: false -# A basic chains constructor that works with the Transition struct we defined. -function AbstractMCMC.bundle_samples( - rng::AbstractRNG, - ℓ::DensityModel, - s::MetropolisHastings, - N::Integer, - ts::Vector{<:Transition}, - chain_type::Type{Any}; - param_names=missing, - kwargs..., -) - # Turn all the transitions into a vector-of-vectors. - vals = copy(reduce(hcat, [vcat(t.θ, t.lp) for t in ts])') - - # Check if we received any parameter names. - if ismissing(param_names) - param_names = ["Parameter $i" for i in 1:(length(first(vals)) - 1)] - end - - # Add the log density field to the parameter names. - push!(param_names, "lp") - - # Bundle everything up and return a Chains struct. 
- return Chains(vals, param_names, (internals=["lp"],)) -end -``` - -All done! - -You can even implement different output formats by implementing `bundle_samples` for different `chain_type`s, which can be provided as keyword argument to `sample`. As default `sample` uses `chain_type = Any`. - -### Testing the implementation - -Now that we have all the pieces, we should test the implementation by defining a model to calculate the mean and variance parameters of a Normal distribution. We can do this by constructing a target density function, providing a sample of data, and then running the sampler with `sample`. - -```{julia} -#| eval: false -# Generate a set of data from the posterior we want to estimate. -data = rand(Normal(5, 3), 30) - -# Define the components of a basic model. -insupport(θ) = θ[2] >= 0 -dist(θ) = Normal(θ[1], θ[2]) -density(θ) = insupport(θ) ? sum(logpdf.(dist(θ), data)) : -Inf - -# Construct a DensityModel. -model = DensityModel(density) - -# Set up our sampler with initial parameters. -spl = MetropolisHastings([0.0, 0.0]) - -# Sample from the posterior. -chain = sample(model, spl, 100000; param_names=["μ", "σ"]) -``` - -If all the interface functions have been extended properly, you should get an output from `display(chain)` that looks something like this: - - -```{.cell-bg} -Object of type Chains, with data of type 100000×3×1 Array{Float64,3} - -Iterations = 1:100000 -Thinning interval = 1 -Chains = 1 -Samples per chain = 100000 -internals = lp -parameters = μ, σ - -2-element Array{ChainDataFrame,1} - -Summary Statistics - -│ Row │ parameters │ mean │ std │ naive_se │ mcse │ ess │ r_hat │ -│ │ Symbol │ Float64 │ Float64 │ Float64 │ Float64 │ Any │ Any │ -├─────┼────────────┼─────────┼──────────┼────────────┼────────────┼─────────┼─────────┤ -│ 1 │ μ │ 5.33157 │ 0.854193 │ 0.0027012 │ 0.00893069 │ 8344.75 │ 1.00009 │ -│ 2 │ σ │ 4.54992 │ 0.632916 │ 0.00200146 │ 0.00534942 │ 14260.8 │ 1.00005 │ - -Quantiles - -│ Row │ parameters │ 2.5% │ 25.0% │ 50.0% │ 75.0% │ 97.5% │ -│ │ Symbol │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │ -├─────┼────────────┼─────────┼─────────┼─────────┼─────────┼─────────┤ -│ 1 │ μ │ 3.6595 │ 4.77754 │ 5.33182 │ 5.89509 │ 6.99651 │ -│ 2 │ σ │ 3.5097 │ 4.09732 │ 4.47805 │ 4.93094 │ 5.96821 │ -``` - -It looks like we're extremely close to our true parameters of `Normal(5,3)`, though with a fairly high variance due to the low sample size. - -## Conclusion - -We've seen how to implement the sampling interface for general projects. Turing's interface methods are ever-evolving, so please open an issue at [AbstractMCMC](https://github.com/TuringLang/AbstractMCMC.jl) with feature requests or problems. \ No newline at end of file +--- +title: Interface Guide +engine: julia +aliases: + - ../../tutorials/docs-06-for-developers-interface/index.html +--- + +```{julia} +#| echo: false +#| output: false +using Pkg; +Pkg.instantiate(); +``` + +# The sampling interface + +Turing implements a sampling interface (hosted at [AbstractMCMC](https://github.com/TuringLang/AbstractMCMC.jl)) that is intended to provide a common framework for Markov chain Monte Carlo samplers. The interface presents several structures and functions that one needs to overload in order to implement an interface-compatible sampler. + +This guide will demonstrate how to implement the interface without Turing. + +## Interface overview + +Any implementation of an inference method that uses the AbstractMCMC interface should implement a subset of the following types and functions: + +1. 
A subtype of `AbstractSampler`, defined as a mutable struct containing state information or sampler parameters.
+2. A function `sample_init!` which performs any necessary set-up (default: do not perform any set-up).
+3. A function `step!` which returns a transition that represents a single draw from the sampler.
+4. A function `transitions_init` which returns a container for the transitions obtained from the sampler (default: return a `Vector{T}` of length `N` where `T` is the type of the transition obtained in the first step and `N` is the number of requested samples).
+5. A function `transitions_save!` which saves transitions to the container (default: save the transition of iteration `i` at position `i` in the vector of transitions).
+6. A function `sample_end!` which handles any sampler wrap-up (default: do not perform any wrap-up).
+7. A function `bundle_samples` which accepts the container of transitions and returns a collection of samples (default: return the vector of transitions).
+
+The interface methods with exclamation points are those that are intended to allow for state mutation. Any mutating function is meant to allow mutation where needed -- you might use:
+
+- `sample_init!` to run some kind of sampler preparation, before sampling begins. This could mutate a sampler's state.
+- `step!` might mutate a sampler flag after each sample.
+- `sample_end!` contains any wrap-up you might need to do. If you were sampling in a transformed space, this might be where you convert everything back to a constrained space.
+
+## Why do you have an interface?
+
+The motivation for the interface is to allow Julia's fantastic probabilistic programming language community to have a set of standards and common implementations so we can all thrive together. Markov chain Monte Carlo methods tend to have a very similar framework to one another, and so a common interface should help the many great inference methods built in single-purpose packages see wider use across the community.
+
+## Implementing Metropolis-Hastings without Turing
+
+[Metropolis-Hastings](https://en.wikipedia.org/wiki/Markov_chain_Monte_Carlo) is often the first sampling method that people are exposed to. It is a very straightforward algorithm and is accordingly the easiest to implement, so it makes for a good example. In this section, you will learn how to use the types and functions listed above to implement the Metropolis-Hastings sampler using the MCMC interface.
+
+The full code for this implementation is housed in [AdvancedMH.jl](https://github.com/TuringLang/AdvancedMH.jl).
+
+### Imports
+
+Let's begin by importing the relevant libraries. We'll import `AbstractMCMC`, which contains the interface framework we'll fill out. We also need `Distributions`, `Random`, `LinearAlgebra` (which provides the identity matrix `I` used in the proposal constructor below), and `MCMCChains` (which provides the `Chains` type returned by `bundle_samples` below).
+
+```{julia}
+# Import the relevant libraries.
+using AbstractMCMC: AbstractMCMC
+using Distributions
+using Random
+using LinearAlgebra
+using MCMCChains
+```
+
+An interface extension (like the one we're writing right now) typically requires that you overload or implement several functions. Specifically, you should `import` the functions you intend to overload; the code block at the end of this section shows one way to do that.
+
+From `Distributions`, we need `Sampleable`, `VariateForm`, and `ValueSupport`, three abstract types that define a distribution. Models in the interface are assumed to be subtypes of `Sampleable{VariateForm, ValueSupport}`. In this section our model is going to be extremely simple, so we will not end up using these except to make sure that the inference functions are dispatching correctly.
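+
+A minimal sketch of what those imports might look like is below; treat it as illustrative, since the exact list depends on which interface functions and defaults you choose to overload.
+
+```{julia}
+#| eval: false
+# Interface functions we plan to overload (illustrative, not exhaustive).
+import AbstractMCMC: step!, sample_init!, sample_end!, bundle_samples
+# Abstract types from Distributions used for dispatch.
+import Distributions: Sampleable, VariateForm, ValueSupport
+```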
+
+### Sampler
+
+Let's begin our sampler definition by defining a sampler called `MetropolisHastings`, which is a subtype of `AbstractSampler`. Correct typing is very important for proper interface implementation -- if your sampler is not a subtype of `AbstractSampler`, your methods may not be dispatched to when you call `sample`.
+
+```{julia}
+# Define a sampler type.
+struct MetropolisHastings{T,D} <: AbstractMCMC.AbstractSampler
+    init_θ::T
+    proposal::D
+end
+
+# Default constructors.
+MetropolisHastings(init_θ::Real) = MetropolisHastings(init_θ, Normal(0, 1))
+function MetropolisHastings(init_θ::Vector{<:Real})
+    return MetropolisHastings(init_θ, MvNormal(zero(init_θ), I))
+end
+```
+
+Above, we have defined a sampler that stores the initial parameterization of the prior, and a distribution object from which proposals are drawn. You can have a struct that has no fields, and simply use it for dispatching onto the relevant functions, or you can store a large amount of state information in your sampler.
+
+The general intuition for what to store in your sampler struct is that anything you may need to perform inference between samples but you don't want to store in a transition should go into the sampler struct. It's the only way you can carry non-sample related state information between `step!` calls.
+
+### Model
+
+Next, we need to have a model of some kind. A model is a struct that's a subtype of `AbstractModel` and contains whatever information is necessary to perform inference on your problem. In our case we want to know the mean and variance parameters of a Normal distribution, so our model need contain nothing more than the log density of a Normal.
+
+Note that we only have to do this because we are not yet integrating the sampler with Turing -- Turing has a very sophisticated modelling engine that removes the need to define custom model structs.
+
+```{julia}
+# Define a model type. Stores the log density function.
+struct DensityModel{F<:Function} <: AbstractMCMC.AbstractModel
+    ℓπ::F
+end
+```
+
+### Transition
+
+The next step is to define a transition type which we will return from each `step!` call. We'll keep it simple by just defining a wrapper struct that contains the parameter draws and the log density of that draw:
+
+```{julia}
+# Create a very basic Transition type, only stores the
+# parameter draws and the log probability of the draw.
+struct Transition{T,L}
+    θ::T
+    lp::L
+end
+
+# Store the new draw and its log density.
+Transition(model::DensityModel, θ) = Transition(θ, ℓπ(model, θ))
+```
+
+`Transition` can now store any type of parameter, whether it's a vector of draws from multiple parameters or a single univariate draw.
+
+### Metropolis-Hastings
+
+Now it's time to get into the actual inference. We've defined all of the core pieces we need, but we need to implement the `step!` function which actually performs inference.
+
+As a refresher, Metropolis-Hastings implements a very basic algorithm:
+
+1. Pick some initial state, ``\theta_0``.
+
+2. For ``t`` in ``[1,N],`` do
+
+  + Generate a proposal parameterization ``\theta^\prime_t \sim q(\theta^\prime_t \mid \theta_{t-1}).``
+
+  + Calculate the acceptance probability, ``\alpha = \min\left[1,\frac{\pi(\theta^\prime_t)}{\pi(\theta_{t-1})} \frac{q(\theta_{t-1} \mid \theta^\prime_t)}{q(\theta^\prime_t \mid \theta_{t-1})} \right].``
+
+  + If ``U \le \alpha`` where ``U \sim \text{Uniform}(0,1),`` then ``\theta_t = \theta^\prime_t.`` Otherwise, ``\theta_t = \theta_{t-1}.``
+
+Of course, it's much easier to do this in the log space, so the acceptance probability is more commonly written as
+
+$$
+\log \alpha = \min\left[0, \log \pi(\theta^\prime_t) - \log \pi(\theta_{t-1}) + \log q(\theta_{t-1} \mid \theta^\prime_t) - \log q(\theta^\prime_t \mid \theta_{t-1}) \right].
+$$
+
+In interface terms, we should do the following:
+
+1. Make a new transition containing a proposed sample.
+2. Calculate the acceptance probability.
+3. If we accept, return the new transition, otherwise, return the old one.
+
+### Steps
+
+The `step!` function is the function that performs the bulk of your inference. In our case, we will implement two `step!` functions -- one for the very first iteration, and one for every subsequent iteration.
+
+```{julia}
+#| eval: false
+# Define the first step! function, which is called at the
+# beginning of sampling. Return the initial parameter used
+# to define the sampler.
+function AbstractMCMC.step!(
+    rng::AbstractRNG,
+    model::DensityModel,
+    spl::MetropolisHastings,
+    N::Integer,
+    ::Nothing;
+    kwargs...,
+)
+    return Transition(model, spl.init_θ)
+end
+```
+
+The first `step!` function just packages up the initial parameterization inside the sampler, and returns it. We implicitly accept the very first parameterization.
+
+The other `step!` function performs the usual steps from Metropolis-Hastings. Included are several helper functions, `propose` and `q`, which are designed to replicate the functions in the pseudocode above.
+
+- `propose` generates a new proposal in the form of a `Transition`, which can be univariate if the value passed in is univariate, or it can be multivariate if the `Transition` given is multivariate. Proposals use a basic `Normal` or `MvNormal` proposal distribution.
+- `q` returns the log density of one parameterization conditional on another, according to the proposal distribution.
+- `step!` generates a new proposal, checks the acceptance probability, and then returns either the previous transition or the proposed transition.
+
+
+```{julia}
+#| eval: false
+# Define a function that makes a basic proposal depending on a univariate
+# parameterization or a multivariate parameterization.
+function propose(spl::MetropolisHastings, model::DensityModel, θ::Real)
+    return Transition(model, θ + rand(spl.proposal))
+end
+function propose(spl::MetropolisHastings, model::DensityModel, θ::Vector{<:Real})
+    return Transition(model, θ + rand(spl.proposal))
+end
+function propose(spl::MetropolisHastings, model::DensityModel, t::Transition)
+    return propose(spl, model, t.θ)
+end
+
+# Calculate the log density `q(θ | θcond)` under the proposal distribution `spl.proposal`.
+q(spl::MetropolisHastings, θ::Real, θcond::Real) = logpdf(spl.proposal, θ - θcond)
+function q(spl::MetropolisHastings, θ::Vector{<:Real}, θcond::Vector{<:Real})
+    return logpdf(spl.proposal, θ - θcond)
+end
+q(spl::MetropolisHastings, t1::Transition, t2::Transition) = q(spl, t1.θ, t2.θ)
+
+# Calculate the density of the model given some parameterization.
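+# (Note: the `Transition` method below reuses the log density cached when the
+# transition was constructed, so the model is not evaluated a second time.)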
+ℓπ(model::DensityModel, θ) = model.ℓπ(θ)
+ℓπ(model::DensityModel, t::Transition) = t.lp
+
+# Define the other step function. Returns a Transition containing
+# either a new proposal (if accepted) or the previous proposal
+# (if not accepted).
+function AbstractMCMC.step!(
+    rng::AbstractRNG,
+    model::DensityModel,
+    spl::MetropolisHastings,
+    ::Integer,
+    θ_prev::Transition;
+    kwargs...,
+)
+    # Generate a new proposal.
+    θ = propose(spl, model, θ_prev)
+
+    # Calculate the log acceptance probability.
+    α = ℓπ(model, θ) - ℓπ(model, θ_prev) + q(spl, θ_prev, θ) - q(spl, θ, θ_prev)
+
+    # Decide whether to return the previous θ or the new one.
+    if log(rand(rng)) < min(α, 0.0)
+        return θ
+    else
+        return θ_prev
+    end
+end
+```
+
+### Chains
+
+In the default implementation, `sample` just returns a vector of all transitions. If instead you would like to obtain a `Chains` object (e.g., to simplify downstream analysis), you have to implement the `bundle_samples` function as well. It accepts the vector of transitions and returns a collection of samples. Fortunately, our `Transition` is incredibly simple, and we only need to build a little bit of functionality to accept custom parameter names passed in by the user.
+
+```{julia}
+#| eval: false
+# A basic chains constructor that works with the Transition struct we defined.
+function AbstractMCMC.bundle_samples(
+    rng::AbstractRNG,
+    ℓ::DensityModel,
+    s::MetropolisHastings,
+    N::Integer,
+    ts::Vector{<:Transition},
+    chain_type::Type{Any};
+    param_names=missing,
+    kwargs...,
+)
+    # Turn all the transitions into a matrix with one row per sample and
+    # one column per parameter (plus a final column for the log density).
+    vals = copy(reduce(hcat, [vcat(t.θ, t.lp) for t in ts])')
+
+    # Check if we received any parameter names.
+    if ismissing(param_names)
+        param_names = ["Parameter $i" for i in 1:(size(vals, 2) - 1)]
+    end
+
+    # Add the log density field to the parameter names.
+    push!(param_names, "lp")
+
+    # Bundle everything up and return a Chains struct.
+    return Chains(vals, param_names, (internals=["lp"],))
+end
+```
+
+All done!
+
+You can even implement different output formats by implementing `bundle_samples` for different `chain_type`s, which can be provided as a keyword argument to `sample`. By default, `sample` uses `chain_type = Any`.
+
+### Testing the implementation
+
+Now that we have all the pieces, we should test the implementation by defining a model to calculate the mean and variance parameters of a Normal distribution. We can do this by constructing a target density function, providing a sample of data, and then running the sampler with `sample`.
+
+```{julia}
+#| eval: false
+# Generate a set of data from the distribution whose parameters we want to estimate.
+data = rand(Normal(5, 3), 30)
+
+# Define the components of a basic model.
+insupport(θ) = θ[2] >= 0
+dist(θ) = Normal(θ[1], θ[2])
+density(θ) = insupport(θ) ? sum(logpdf.(dist(θ), data)) : -Inf
+
+# Construct a DensityModel.
+model = DensityModel(density)
+
+# Set up our sampler with initial parameters.
+spl = MetropolisHastings([0.0, 0.0])
+
+# Sample from the posterior.
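+# (`param_names` is passed through `sample`'s keyword arguments and eventually
+# reaches our `bundle_samples` method above.)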
+chain = sample(model, spl, 100000; param_names=["μ", "σ"]) +``` + +If all the interface functions have been extended properly, you should get an output from `display(chain)` that looks something like this: + + +```{.cell-bg} +Object of type Chains, with data of type 100000×3×1 Array{Float64,3} + +Iterations = 1:100000 +Thinning interval = 1 +Chains = 1 +Samples per chain = 100000 +internals = lp +parameters = μ, σ + +2-element Array{ChainDataFrame,1} + +Summary Statistics + +│ Row │ parameters │ mean │ std │ naive_se │ mcse │ ess │ r_hat │ +│ │ Symbol │ Float64 │ Float64 │ Float64 │ Float64 │ Any │ Any │ +├─────┼────────────┼─────────┼──────────┼────────────┼────────────┼─────────┼─────────┤ +│ 1 │ μ │ 5.33157 │ 0.854193 │ 0.0027012 │ 0.00893069 │ 8344.75 │ 1.00009 │ +│ 2 │ σ │ 4.54992 │ 0.632916 │ 0.00200146 │ 0.00534942 │ 14260.8 │ 1.00005 │ + +Quantiles + +│ Row │ parameters │ 2.5% │ 25.0% │ 50.0% │ 75.0% │ 97.5% │ +│ │ Symbol │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │ +├─────┼────────────┼─────────┼─────────┼─────────┼─────────┼─────────┤ +│ 1 │ μ │ 3.6595 │ 4.77754 │ 5.33182 │ 5.89509 │ 6.99651 │ +│ 2 │ σ │ 3.5097 │ 4.09732 │ 4.47805 │ 4.93094 │ 5.96821 │ +``` + +It looks like we're extremely close to our true parameters of `Normal(5,3)`, though with a fairly high variance due to the low sample size. + +## Conclusion + +We've seen how to implement the sampling interface for general projects. Turing's interface methods are ever-evolving, so please open an issue at [AbstractMCMC](https://github.com/TuringLang/AbstractMCMC.jl) with feature requests or problems. diff --git a/tutorials/docs-04-for-developers-abstractmcmc-turing/index.qmd b/developers/inference/abstractmcmc-turing/index.qmd similarity index 97% rename from tutorials/docs-04-for-developers-abstractmcmc-turing/index.qmd rename to developers/inference/abstractmcmc-turing/index.qmd index b0723c5f2..6d313f232 100755 --- a/tutorials/docs-04-for-developers-abstractmcmc-turing/index.qmd +++ b/developers/inference/abstractmcmc-turing/index.qmd @@ -1,327 +1,329 @@ ---- -title: How Turing implements AbstractMCMC -engine: julia ---- - -```{julia} -#| echo: false -#| output: false -using Pkg; -Pkg.instantiate(); -``` - -Prerequisite: [Interface guide]({{}}). - -## Introduction - -Consider the following Turing, code block: - -```{julia} -using Turing - -@model function gdemo(x, y) - s² ~ InverseGamma(2, 3) - m ~ Normal(0, sqrt(s²)) - x ~ Normal(m, sqrt(s²)) - return y ~ Normal(m, sqrt(s²)) -end - -mod = gdemo(1.5, 2) -alg = IS() -n_samples = 1000 - -chn = sample(mod, alg, n_samples, progress=false) -``` - -The function `sample` is part of the AbstractMCMC interface. As explained in the [interface guide]({{}}), building a sampling method that can be used by `sample` consists in overloading the structs and functions in `AbstractMCMC`. The interface guide also gives a standalone example of their implementation, [`AdvancedMH.jl`](). - -Turing sampling methods (most of which are written [here](https://github.com/TuringLang/Turing.jl/tree/master/src/mcmc)) also implement `AbstractMCMC`. Turing defines a particular architecture for `AbstractMCMC` implementations, that enables working with models defined by the `@model` macro, and uses DynamicPPL as a backend. The goal of this page is to describe this architecture, and how you would go about implementing your own sampling method in Turing, using Importance Sampling as an example. I don't go into all the details: for instance, I don't address selectors or parallelism. 
- -First, we explain how Importance Sampling works in the abstract. Consider the model defined in the first code block. Mathematically, it can be written: - -$$ -\begin{align*} -s &\sim \text{InverseGamma}(2, 3), \\ -m &\sim \text{Normal}(0, \sqrt{s}), \\ -x &\sim \text{Normal}(m, \sqrt{s}), \\ -y &\sim \text{Normal}(m, \sqrt{s}). -\end{align*} -$$ - -The **latent** variables are $s$ and $m$, the **observed** variables are $x$ and $y$. The model **joint** distribution $p(s,m,x,y)$ decomposes into the **prior** $p(s,m)$ and the **likelihood** $p(x,y \mid s,m).$ Since $x = 1.5$ and $y = 2$ are observed, the goal is to infer the **posterior** distribution $p(s,m \mid x,y).$ - -Importance Sampling produces independent samples $(s_i, m_i)$ from the prior distribution. It also outputs unnormalized weights - -$$ -w_i = \frac {p(x,y,s_i,m_i)} {p(s_i, m_i)} = p(x,y \mid s_i, m_i) -$$ - -such that the empirical distribution - -$$ -\frac{1}{N} \sum_{i =1}^N \frac {w_i} {\sum_{j=1}^N w_j} \delta_{(s_i, m_i)} -$$ - -is a good approximation of the posterior. - -## 1. Define a Sampler - -Recall the last line of the above code block: - -```{julia} -chn = sample(mod, alg, n_samples, progress=false) -``` - -Here `sample` takes as arguments a **model** `mod`, an **algorithm** `alg`, and a **number of samples** `n_samples`, and returns an instance `chn` of `Chains` which can be analysed using the functions in `MCMCChains`. - -### Models - -To define a **model**, you declare a joint distribution on variables in the `@model` macro, and specify which variables are observed and which should be inferred, as well as the value of the observed variables. Thus, when implementing Importance Sampling, - -```{julia} -mod = gdemo(1.5, 2) -``` - -creates an instance `mod` of the struct `Model`, which corresponds to the observations of a value of `1.5` for `x`, and a value of `2` for `y`. - -This is all handled by DynamicPPL, more specifically [here](https://github.com/TuringLang/DynamicPPL.jl/blob/master/src/model.jl). I will return to how models are used to inform sampling algorithms [below](#assumeobserve). - -### Algorithms - -An **algorithm** is just a sampling method: in Turing, it is a subtype of the abstract type `InferenceAlgorithm`. Defining an algorithm may require specifying a few high-level parameters. For example, "Hamiltonian Monte-Carlo" may be too vague, but "Hamiltonian Monte Carlo with 10 leapfrog steps per proposal and a stepsize of 0.01" is an algorithm. "Metropolis-Hastings" may be too vague, but "Metropolis-Hastings with proposal distribution `p`" is an algorithm. -Thus - -```{julia} -stepsize = 0.01 -L = 10 -alg = HMC(stepsize, L) -``` - -defines a Hamiltonian Monte-Carlo algorithm, an instance of `HMC`, which is a subtype of `InferenceAlgorithm`. - -In the case of Importance Sampling, there is no need to specify additional parameters: - -```{julia} -alg = IS() -``` - -defines an Importance Sampling algorithm, an instance of `IS`, a subtype of `InferenceAlgorithm`. - -When creating your own Turing sampling method, you must, therefore, build a subtype of `InferenceAlgorithm` corresponding to your method. - -### Samplers - -Samplers are **not** the same as algorithms. An algorithm is a generic sampling method, a sampler is an object that stores information about how algorithm and model interact during sampling, and is modified as sampling progresses. The `Sampler` struct is defined in DynamicPPL. - -Turing implements `AbstractMCMC`'s `AbstractSampler` with the `Sampler` struct defined in `DynamicPPL`. 
The most important attributes of an instance `spl` of `Sampler` are: - -- `spl.alg`: the sampling method used, an instance of a subtype of `InferenceAlgorithm` -- `spl.state`: information about the sampling process, see [below](#states) - -When you call `sample(mod, alg, n_samples)`, Turing first uses `model` and `alg` to build an instance `spl` of `Sampler` , then calls the native `AbstractMCMC` function `sample(mod, spl, n_samples)`. - -When you define your own Turing sampling method, you must therefore build: - -- a **sampler constructor** that uses a model and an algorithm to initialize an instance of `Sampler`. For Importance Sampling: - -```{julia} -#| eval: false -function Sampler(alg::IS, model::Model, s::Selector) - info = Dict{Symbol,Any}() - state = ISState(model) - return Sampler(alg, info, s, state) -end -``` - -- a **state** struct implementing `AbstractSamplerState` corresponding to your method: we cover this in the following paragraph. - -### States - -The `vi` field contains all the important information about sampling: first and foremost, the values of all the samples, but also the distributions from which they are sampled, the names of model parameters, and other metadata. As we will see below, many important steps during sampling correspond to queries or updates to `spl.state.vi`. - -By default, you can use `SamplerState`, a concrete type defined in `inference/Inference.jl`, which extends `AbstractSamplerState` and has no field except for `vi`: - -```{julia} -#| eval: false -mutable struct SamplerState{VIType<:VarInfo} <: AbstractSamplerState - vi::VIType -end -``` - -When doing Importance Sampling, we care not only about the values of the samples but also their weights. We will see below that the weight of each sample is also added to `spl.state.vi`. Moreover, the average - -$$ -\frac 1 N \sum_{j=1}^N w_i = \frac 1 N \sum_{j=1}^N p(x,y \mid s_i, m_i) -$$ - -of the sample weights is a particularly important quantity: - -- it is used to **normalize** the **empirical approximation** of the posterior distribution -- its logarithm is the importance sampling **estimate** of the **log evidence** $\log p(x, y)$ - -To avoid having to compute it over and over again, `is.jl`defines an IS-specific concrete type `ISState` for sampler states, with an additional field `final_logevidence` containing - -$$ -\log \frac 1 N \sum_{j=1}^N w_i. -$$ - -```{julia} -#| eval: false -mutable struct ISState{V<:VarInfo,F<:AbstractFloat} <: AbstractSamplerState - vi::V - final_logevidence::F -end - -# additional constructor -ISState(model::Model) = ISState(VarInfo(model), 0.0) -``` - -The following diagram summarizes the hierarchy presented above. - -```{dot} -//| echo: false -digraph G { - node [shape=box]; - - spl [label=Sampler
<:AbstractSampler>, style=rounded, xlabel="", shape=box]; - state [label=State
<:AbstractSamplerState>, style=rounded, xlabel="", shape=box]; - alg [label=Algorithm
<:InferenceAlgorithm>, style=rounded, xlabel="", shape=box]; - vi [label=VarInfo
<:AbstractVarInfo>, style=rounded, xlabel="", shape=box]; - placeholder1 [label="...", width=1]; - placeholder2 [label="...", width=1]; - placeholder3 [label="...", width=1]; - placeholder4 [label="...", width=1]; - - spl -> state; - spl -> alg; - spl -> placeholder1; - - state -> vi; - state -> placeholder2; - - alg -> placeholder3; - placeholder1 -> placeholder4; -} -``` - -## 2. Overload the functions used inside mcmcsample - -A lot of the things here are method-specific. However, Turing also has some functions that make it easier for you to implement these functions, for example. - -### Transitions - -`AbstractMCMC` stores information corresponding to each individual sample in objects called `transition`, but does not specify what the structure of these objects could be. You could decide to implement a type `MyTransition` for transitions corresponding to the specifics of your methods. However, there are many situations in which the only information you need for each sample is: - -- its value: $\theta$ -- log of the joint probability of the observed data and this sample: `lp` - -`Inference.jl` [defines](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/Inference.jl#L103) a struct `Transition`, which corresponds to this default situation - -```{julia} -#| eval: false -struct Transition{T,F<:AbstractFloat} - θ::T - lp::F -end -``` - -It also [contains](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/Inference.jl#L108) a constructor that builds an instance of `Transition` from an instance `spl` of `Sampler`: $\theta$ is `spl.state.vi` converted to a `namedtuple`, and `lp` is `getlogp(spl.state.vi)`. `is.jl` uses this default constructor at the end of the `step!` function [here](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/is.jl#L58). - -### How `sample` works - -A crude summary, which ignores things like parallelism, is the following: - -`sample` calls `mcmcsample`, which calls - -- `sample_init!` to set things up -- `step!` repeatedly to produce multiple new transitions -- `sample_end!` to perform operations once all samples have been obtained -- `bundle_samples` to convert a vector of transitions into a more palatable type, for instance a `Chain`. - -You can, of course, implement all of these functions, but `AbstractMCMC` as well as Turing, also provide default implementations for simple cases. For instance, importance sampling uses the default implementations of `sample_init!` and `bundle_samples`, which is why you don't see code for them inside `is.jl`. - -## 3. Overload assume and observe - -The functions mentioned above, such as `sample_init!`, `step!`, etc., must, of course, use information about the model in order to generate samples! In particular, these functions may need **samples from distributions** defined in the model or to **evaluate the density of these distributions** at some values of the corresponding parameters or observations. - -For an example of the former, consider **Importance Sampling** as defined in `is.jl`. This implementation of Importance Sampling uses the model prior distribution as a proposal distribution, and therefore requires **samples from the prior distribution** of the model. Another example is **Approximate Bayesian Computation**, which requires multiple **samples from the model prior and likelihood distributions** in order to generate a single sample. - -An example of the latter is the **Metropolis-Hastings** algorithm. 
At every step of sampling from a target posterior - -$$ -p(\theta \mid x_{\text{obs}}), -$$ - -in order to compute the acceptance ratio, you need to **evaluate the model joint density** - -$$ -p\left(\theta_{\text{prop}}, x_{\text{obs}}\right) -$$ - -with $\theta_{\text{prop}}$ a sample from the proposal and $x_{\text{obs}}$ the observed data. - -This begs the question: how can these functions access model information during sampling? Recall that the model is stored as an instance `m` of `Model`. One of the attributes of `m` is the model evaluation function `m.f`, which is built by compiling the `@model` macro. Executing `f` runs the tilde statements of the model in order, and adds model information to the sampler (the instance of `Sampler` that stores information about the ongoing sampling process) at each step (see [here](https://turinglang.org/dev/docs/for-developers/compiler) for more information about how the `@model` macro is compiled). The DynamicPPL functions `assume` and `observe` determine what kind of information to add to the sampler for every tilde statement. - -Consider an instance `m` of `Model` and a sampler `spl`, with associated `VarInfo` `vi = spl.state.vi`. At some point during the sampling process, an AbstractMCMC function such as `step!` calls `m(vi, ...)`, which calls the model evaluation function `m.f(vi, ...)`. - - - for every tilde statement in the `@model` macro, `m.f(vi, ...)` returns model-related information (samples, value of the model density, etc.), and adds it to `vi`. How does it do that? - - + recall that the code for `m.f(vi, ...)` is automatically generated by compilation of the `@model` macro - - + for every tilde statement in the `@model` declaration, this code contains a call to `assume(vi, ...)` if the variable on the LHS of the tilde is a **model parameter to infer**, and `observe(vi, ...)` if the variable on the LHS of the tilde is an **observation** - - + in the file corresponding to your sampling method (ie in `Turing.jl/src/inference/.jl`), you have **overloaded** `assume` and `observe`, so that they can modify `vi` to include the information and samples that you care about! - - + at a minimum, `assume` and `observe` return the log density `lp` of the sample or observation. the model evaluation function then immediately calls `acclogp!!(vi, lp)`, which adds `lp` to the value of the log joint density stored in `vi`. - -Here's what `assume` looks like for Importance Sampling: - -```{julia} -#| eval: false -function DynamicPPL.assume(rng, spl::Sampler{<:IS}, dist::Distribution, vn::VarName, vi) - r = rand(rng, dist) - push!(vi, vn, r, dist, spl) - return r, 0 -end -``` - -The function first generates a sample `r` from the distribution `dist` (the right hand side of the tilde statement). It then adds `r` to `vi`, and returns `r` and 0. - -The `observe` function is even simpler: - -```{julia} -#| eval: false -function DynamicPPL.observe(spl::Sampler{<:IS}, dist::Distribution, value, vi) - return logpdf(dist, value) -end -``` - -It simply returns the density (in the discrete case, the probability) of the observed value under the distribution `dist`. - -## 4. Summary: Importance Sampling step by step - -We focus on the AbstractMCMC functions that are overridden in `is.jl` and executed inside `mcmcsample`: `step!`, which is called `n_samples` times, and `sample_end!`, which is executed once after those `n_samples` iterations. 
-
-  - During the $i$-th iteration, `step!` does 3 things:
-
-      + `empty!!(spl.state.vi)`: remove information about the previous sample from the sampler's `VarInfo`
-
-      + `model(rng, spl.state.vi, spl)`: call the model evaluation function
-
-          * calls to `assume` add the samples from the prior $s_i$ and $m_i$ to `spl.state.vi`
-
-          * calls to `assume` or `observe` are followed by the line `acclogp!!(vi, lp)`, where `lp` is an output of `assume` and `observe`
-
-          * `lp` is set to 0 after `assume`, and to the value of the density at the observation after `observe`
-
-          * When all the tilde statements have been covered, `spl.state.vi.logp[]` is the sum of the `lp`, i.e., the likelihood $\log p(x, y \mid s_i, m_i) = \log p(x \mid s_i, m_i) + \log p(y \mid s_i, m_i)$ of the observations given the latent variable samples $s_i$ and $m_i$.
-
-      + `return Transition(spl)`: build a transition from the sampler, and return that transition
-
-          * the transition's `vi` field is simply `spl.state.vi`
-
-          * the `lp` field contains the likelihood `spl.state.vi.logp[]`
-
-  - When the `n_samples` iterations are completed, `sample_end!` fills the `final_logevidence` field of `spl.state`
-
-      + It simply takes the logarithm of the average of the sample weights, using the log weights for numerical stability
+---
+title: How Turing Implements AbstractMCMC
+engine: julia
+aliases:
+  - ../../tutorials/docs-04-for-developers-abstractmcmc-turing/index.html
+---
+
+```{julia}
+#| echo: false
+#| output: false
+using Pkg;
+Pkg.instantiate();
+```
+
+Prerequisite: [Interface guide]({{}}).
+
+## Introduction
+
+Consider the following Turing code block:
+
+```{julia}
+using Turing
+
+@model function gdemo(x, y)
+    s² ~ InverseGamma(2, 3)
+    m ~ Normal(0, sqrt(s²))
+    x ~ Normal(m, sqrt(s²))
+    return y ~ Normal(m, sqrt(s²))
+end
+
+mod = gdemo(1.5, 2)
+alg = IS()
+n_samples = 1000
+
+chn = sample(mod, alg, n_samples, progress=false)
+```
+
+The function `sample` is part of the AbstractMCMC interface. As explained in the [interface guide]({{}}), building a sampling method that can be used by `sample` consists of overloading the structs and functions in `AbstractMCMC`. The interface guide also gives a standalone example of their implementation, [`AdvancedMH.jl`](https://github.com/TuringLang/AdvancedMH.jl).
+
+Turing sampling methods (most of which are written [here](https://github.com/TuringLang/Turing.jl/tree/master/src/mcmc)) also implement `AbstractMCMC`. Turing defines a particular architecture for `AbstractMCMC` implementations that enables working with models defined by the `@model` macro, and uses DynamicPPL as a backend. The goal of this page is to describe this architecture, and how you would go about implementing your own sampling method in Turing, using Importance Sampling as an example. I don't go into all the details: for instance, I don't address selectors or parallelism.
+
+First, we explain how Importance Sampling works in the abstract. Consider the model defined in the first code block. Mathematically, it can be written:
+
+$$
+\begin{align*}
+s &\sim \text{InverseGamma}(2, 3), \\
+m &\sim \text{Normal}(0, \sqrt{s}), \\
+x &\sim \text{Normal}(m, \sqrt{s}), \\
+y &\sim \text{Normal}(m, \sqrt{s}).
+\end{align*}
+$$
+
+The **latent** variables are $s$ and $m$, the **observed** variables are $x$ and $y$. The model **joint** distribution $p(s,m,x,y)$ decomposes into the **prior** $p(s,m)$ and the **likelihood** $p(x,y \mid s,m).$ Since $x = 1.5$ and $y = 2$ are observed, the goal is to infer the **posterior** distribution $p(s,m \mid x,y).$
+
+Importance Sampling produces independent samples $(s_i, m_i)$ from the prior distribution. It also outputs unnormalized weights
+
+$$
+w_i = \frac {p(x,y,s_i,m_i)} {p(s_i, m_i)} = p(x,y \mid s_i, m_i)
+$$
+
+such that the empirical distribution
+
+$$
+\sum_{i = 1}^N \frac {w_i} {\sum_{j=1}^N w_j} \delta_{(s_i, m_i)}
+$$
+
+is a good approximation of the posterior.
+
+## 1. Define a Sampler
+
+Recall the last line of the above code block:
+
+```{julia}
+chn = sample(mod, alg, n_samples, progress=false)
+```
+
+Here `sample` takes as arguments a **model** `mod`, an **algorithm** `alg`, and a **number of samples** `n_samples`, and returns an instance `chn` of `Chains` which can be analysed using the functions in `MCMCChains`.
+
+### Models
+
+To define a **model**, you declare a joint distribution on variables in the `@model` macro, and specify which variables are observed and which should be inferred, as well as the value of the observed variables. Thus, when implementing Importance Sampling,
+
+```{julia}
+mod = gdemo(1.5, 2)
+```
+
+creates an instance `mod` of the struct `Model`, which corresponds to the observations of a value of `1.5` for `x`, and a value of `2` for `y`.
+
+This is all handled by DynamicPPL, more specifically [here](https://github.com/TuringLang/DynamicPPL.jl/blob/master/src/model.jl). I will return to how models are used to inform sampling algorithms [below](#assumeobserve).
+
+### Algorithms
+
+An **algorithm** is just a sampling method: in Turing, it is a subtype of the abstract type `InferenceAlgorithm`. Defining an algorithm may require specifying a few high-level parameters. For example, "Hamiltonian Monte Carlo" may be too vague, but "Hamiltonian Monte Carlo with 10 leapfrog steps per proposal and a stepsize of 0.01" is an algorithm. "Metropolis-Hastings" may be too vague, but "Metropolis-Hastings with proposal distribution `p`" is an algorithm.
+Thus
+
+```{julia}
+stepsize = 0.01
+L = 10
+alg = HMC(stepsize, L)
+```
+
+defines a Hamiltonian Monte Carlo algorithm, an instance of `HMC`, which is a subtype of `InferenceAlgorithm`.
+
+In the case of Importance Sampling, there is no need to specify additional parameters:
+
+```{julia}
+alg = IS()
+```
+
+defines an Importance Sampling algorithm, an instance of `IS`, a subtype of `InferenceAlgorithm`.
+
+When creating your own Turing sampling method, you must, therefore, build a subtype of `InferenceAlgorithm` corresponding to your method.
+
+### Samplers
+
+Samplers are **not** the same as algorithms. An algorithm is a generic sampling method; a sampler is an object that stores information about how the algorithm and the model interact during sampling, and is modified as sampling progresses. The `Sampler` struct is defined in DynamicPPL.
+
+Turing implements `AbstractMCMC`'s `AbstractSampler` with the `Sampler` struct defined in `DynamicPPL`. The most important attributes of an instance `spl` of `Sampler` are:
+
+- `spl.alg`: the sampling method used, an instance of a subtype of `InferenceAlgorithm`
+- `spl.state`: information about the sampling process, see [below](#states)
+
+When you call `sample(mod, alg, n_samples)`, Turing first uses `model` and `alg` to build an instance `spl` of `Sampler`, then calls the native `AbstractMCMC` function `sample(mod, spl, n_samples)`.
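+
+In sketch form, this chain of calls looks roughly like the following (hypothetical code based on the constructor shown below, not Turing's verbatim internals):
+
+```{julia}
+#| eval: false
+# What `sample(mod, alg, n_samples)` does, schematically:
+spl = Sampler(alg, mod, Selector())             # build a sampler from the algorithm and model
+chn = AbstractMCMC.sample(mod, spl, n_samples)  # hand off to the native AbstractMCMC method
+```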
+
+When you define your own Turing sampling method, you must therefore build:
+
+- a **sampler constructor** that uses a model and an algorithm to initialize an instance of `Sampler`. For Importance Sampling:
+
+```{julia}
+#| eval: false
+function Sampler(alg::IS, model::Model, s::Selector)
+    info = Dict{Symbol,Any}()
+    state = ISState(model)
+    return Sampler(alg, info, s, state)
+end
+```
+
+- a **state** struct implementing `AbstractSamplerState` corresponding to your method: we cover this in the following paragraph.
+
+### States
+
+Sampler states are expected to carry a field `vi`, which contains all the important information about sampling: first and foremost, the values of all the samples, but also the distributions from which they are sampled, the names of model parameters, and other metadata. As we will see below, many important steps during sampling correspond to queries or updates to `spl.state.vi`.
+
+By default, you can use `SamplerState`, a concrete type defined in `inference/Inference.jl`, which extends `AbstractSamplerState` and has no field except for `vi`:
+
+```{julia}
+#| eval: false
+mutable struct SamplerState{VIType<:VarInfo} <: AbstractSamplerState
+    vi::VIType
+end
+```
+
+When doing Importance Sampling, we care not only about the values of the samples but also their weights. We will see below that the weight of each sample is also added to `spl.state.vi`. Moreover, the average
+
+$$
+\frac 1 N \sum_{i=1}^N w_i = \frac 1 N \sum_{i=1}^N p(x,y \mid s_i, m_i)
+$$
+
+of the sample weights is a particularly important quantity:
+
+- it is used to **normalize** the **empirical approximation** of the posterior distribution
+- its logarithm is the importance sampling **estimate** of the **log evidence** $\log p(x, y)$
+
+To avoid having to compute it over and over again, `is.jl` defines an IS-specific concrete type `ISState` for sampler states, with an additional field `final_logevidence` containing
+
+$$
+\log \frac 1 N \sum_{i=1}^N w_i.
+$$
+
+```{julia}
+#| eval: false
+mutable struct ISState{V<:VarInfo,F<:AbstractFloat} <: AbstractSamplerState
+    vi::V
+    final_logevidence::F
+end
+
+# additional constructor
+ISState(model::Model) = ISState(VarInfo(model), 0.0)
+```
+
+The following diagram summarizes the hierarchy presented above.
+
+```{dot}
+//| echo: false
+digraph G {
+    node [shape=box];
+
+    spl [label=Sampler
<:AbstractSampler>, style=rounded, xlabel="", shape=box]; + state [label=State
<:AbstractSamplerState>, style=rounded, xlabel="", shape=box]; + alg [label=Algorithm
<:InferenceAlgorithm>, style=rounded, xlabel="", shape=box]; + vi [label=VarInfo
<:AbstractVarInfo>, style=rounded, xlabel="", shape=box];
+    placeholder1 [label="...", width=1];
+    placeholder2 [label="...", width=1];
+    placeholder3 [label="...", width=1];
+    placeholder4 [label="...", width=1];
+
+    spl -> state;
+    spl -> alg;
+    spl -> placeholder1;
+
+    state -> vi;
+    state -> placeholder2;
+
+    alg -> placeholder3;
+    placeholder1 -> placeholder4;
+}
+```
+
+## 2. Overload the functions used inside mcmcsample
+
+A lot of the things here are method-specific. However, Turing also provides some functions and default types that make implementing these functions easier; the `Transition` struct described below is one example.
+
+### Transitions
+
+`AbstractMCMC` stores information corresponding to each individual sample in objects called transitions, but does not specify what the structure of these objects could be. You could decide to implement a type `MyTransition` for transitions corresponding to the specifics of your methods. However, there are many situations in which the only information you need for each sample is:
+
+- its value: $\theta$
+- the log of the joint probability of the observed data and this sample: `lp`
+
+`Inference.jl` [defines](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/Inference.jl#L103) a struct `Transition`, which corresponds to this default situation
+
+```{julia}
+#| eval: false
+struct Transition{T,F<:AbstractFloat}
+    θ::T
+    lp::F
+end
+```
+
+It also [contains](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/Inference.jl#L108) a constructor that builds an instance of `Transition` from an instance `spl` of `Sampler`: $\theta$ is `spl.state.vi` converted to a `namedtuple`, and `lp` is `getlogp(spl.state.vi)`. `is.jl` uses this default constructor at the end of the `step!` function [here](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/is.jl#L58).
+
+### How `sample` works
+
+A crude summary, which ignores things like parallelism, is the following:
+
+`sample` calls `mcmcsample`, which calls
+
+- `sample_init!` to set things up
+- `step!` repeatedly to produce multiple new transitions
+- `sample_end!` to perform operations once all samples have been obtained
+- `bundle_samples` to convert a vector of transitions into a more palatable type, for instance a `Chain`.
+
+You can, of course, implement all of these functions, but `AbstractMCMC`, as well as Turing, also provides default implementations for simple cases. For instance, importance sampling uses the default implementations of `sample_init!` and `bundle_samples`, which is why you don't see code for them inside `is.jl`.
+
+## 3. Overload assume and observe
+
+The functions mentioned above, such as `sample_init!`, `step!`, etc., must, of course, use information about the model in order to generate samples! In particular, these functions may need **samples from distributions** defined in the model or to **evaluate the density of these distributions** at some values of the corresponding parameters or observations.
+
+For an example of the former, consider **Importance Sampling** as defined in `is.jl`. This implementation of Importance Sampling uses the model prior distribution as a proposal distribution, and therefore requires **samples from the prior distribution** of the model. Another example is **Approximate Bayesian Computation**, which requires multiple **samples from the model prior and likelihood distributions** in order to generate a single sample.
+
+An example of the latter is the **Metropolis-Hastings** algorithm. At every step of sampling from a target posterior
+
+$$
+p(\theta \mid x_{\text{obs}}),
+$$
+
+in order to compute the acceptance ratio, you need to **evaluate the model joint density**
+
+$$
+p\left(\theta_{\text{prop}}, x_{\text{obs}}\right)
+$$
+
+with $\theta_{\text{prop}}$ a sample from the proposal and $x_{\text{obs}}$ the observed data.
+
+This raises the question: how can these functions access model information during sampling? Recall that the model is stored as an instance `m` of `Model`. One of the attributes of `m` is the model evaluation function `m.f`, which is built by compiling the `@model` macro. Executing `f` runs the tilde statements of the model in order, and adds model information to the sampler (the instance of `Sampler` that stores information about the ongoing sampling process) at each step (see [here](https://turinglang.org/dev/docs/for-developers/compiler) for more information about how the `@model` macro is compiled). The DynamicPPL functions `assume` and `observe` determine what kind of information to add to the sampler for every tilde statement.
+
+Consider an instance `m` of `Model` and a sampler `spl`, with associated `VarInfo` `vi = spl.state.vi`. At some point during the sampling process, an AbstractMCMC function such as `step!` calls `m(vi, ...)`, which calls the model evaluation function `m.f(vi, ...)`.
+
+  - for every tilde statement in the `@model` macro, `m.f(vi, ...)` returns model-related information (samples, value of the model density, etc.), and adds it to `vi`. How does it do that?
+
+      + recall that the code for `m.f(vi, ...)` is automatically generated by compilation of the `@model` macro
+
+      + for every tilde statement in the `@model` declaration, this code contains a call to `assume(vi, ...)` if the variable on the LHS of the tilde is a **model parameter to infer**, and `observe(vi, ...)` if the variable on the LHS of the tilde is an **observation**
+
+      + in the file corresponding to your sampling method (i.e., in `Turing.jl/src/inference/.jl`), you have **overloaded** `assume` and `observe`, so that they can modify `vi` to include the information and samples that you care about!
+
+      + at a minimum, `assume` and `observe` return the log density `lp` of the sample or observation. The model evaluation function then immediately calls `acclogp!!(vi, lp)`, which adds `lp` to the value of the log joint density stored in `vi`.
+
+Here's what `assume` looks like for Importance Sampling:
+
+```{julia}
+#| eval: false
+function DynamicPPL.assume(rng, spl::Sampler{<:IS}, dist::Distribution, vn::VarName, vi)
+    r = rand(rng, dist)
+    push!(vi, vn, r, dist, spl)
+    return r, 0
+end
+```
+
+The function first generates a sample `r` from the distribution `dist` (the right hand side of the tilde statement). It then adds `r` to `vi`, and returns `r` and 0. The returned log density is 0 because, with the prior as the proposal distribution, the prior density cancels out of the importance weight; only the likelihood terms accumulated by `observe` matter.
+
+The `observe` function is even simpler:
+
+```{julia}
+#| eval: false
+function DynamicPPL.observe(spl::Sampler{<:IS}, dist::Distribution, value, vi)
+    return logpdf(dist, value)
+end
+```
+
+It simply returns the density (in the discrete case, the probability) of the observed value under the distribution `dist`.
+
+## 4. Summary: Importance Sampling step by step
+
+We focus on the AbstractMCMC functions that are overridden in `is.jl` and executed inside `mcmcsample`: `step!`, which is called `n_samples` times, and `sample_end!`, which is executed once after those `n_samples` iterations.
+
+  - During the $i$-th iteration, `step!` does 3 things:
+
+      + `empty!!(spl.state.vi)`: remove information about the previous sample from the sampler's `VarInfo`
+
+      + `model(rng, spl.state.vi, spl)`: call the model evaluation function
+
+          * calls to `assume` add the samples from the prior $s_i$ and $m_i$ to `spl.state.vi`
+
+          * calls to `assume` or `observe` are followed by the line `acclogp!!(vi, lp)`, where `lp` is an output of `assume` and `observe`
+
+          * `lp` is set to 0 after `assume`, and to the value of the density at the observation after `observe`
+
+          * When all the tilde statements have been covered, `spl.state.vi.logp[]` is the sum of the `lp`, i.e., the likelihood $\log p(x, y \mid s_i, m_i) = \log p(x \mid s_i, m_i) + \log p(y \mid s_i, m_i)$ of the observations given the latent variable samples $s_i$ and $m_i$.
+
+      + `return Transition(spl)`: build a transition from the sampler, and return that transition
+
+          * the transition's `vi` field is simply `spl.state.vi`
+
+          * the `lp` field contains the likelihood `spl.state.vi.logp[]`
+
+  - When the `n_samples` iterations are completed, `sample_end!` fills the `final_logevidence` field of `spl.state`
+
+      + It simply takes the logarithm of the average of the sample weights, using the log weights for numerical stability
diff --git a/tutorials/docs-17-implementing-samplers/index.qmd b/developers/inference/implementing-samplers/index.qmd
similarity index 99%
rename from tutorials/docs-17-implementing-samplers/index.qmd
rename to developers/inference/implementing-samplers/index.qmd
index 5c3aee618..9d69fbb80 100644
--- a/tutorials/docs-17-implementing-samplers/index.qmd
+++ b/developers/inference/implementing-samplers/index.qmd
@@ -1,8 +1,10 @@
 ---
-title: Implementing samplers
+title: Implementing Samplers
 engine: julia
 julia:
     exeflags: ["--project=@.", "-t 4"]
+aliases:
+  - ../../tutorials/docs-17-implementing-samplers/index.html
 ---
 
 ```{julia}
diff --git a/tutorials/docs-07-for-developers-variational-inference/index.qmd b/developers/inference/variational-inference/index.qmd
similarity index 98%
rename from tutorials/docs-07-for-developers-variational-inference/index.qmd
rename to developers/inference/variational-inference/index.qmd
index 332e7c6ed..0965b07c7 100755
--- a/tutorials/docs-07-for-developers-variational-inference/index.qmd
+++ b/developers/inference/variational-inference/index.qmd
@@ -1,383 +1,385 @@
----
-title: Variational Inference
-engine: julia
----
-
-# Overview
-
-In this post, we'll examine variational inference (VI), a family of approximate Bayesian inference methods. We will focus on one of the more standard VI methods, Automatic Differentiation Variational Inference (ADVI).
-
-Here, we'll examine the theory behind VI, but if you're interested in using ADVI in Turing, [check out this tutorial]({{}}).
-
-# Motivation
-
-In Bayesian inference, one usually specifies a model as follows: given data $\\{x_i\\}_{i = 1}^n$,
-
-::: {.column-page}
-$$
-\begin{align*}
-    \text{prior:} \quad z &\sim p(z) \\
-    \text{likelihood:} \quad x_i &\overset{\text{i.i.d.}}{\sim} p(x \mid z) \quad \text{where} \quad i = 1, \dots, n
-\end{align*}
-$$
-:::
-
-where $\overset{\text{i.i.d.}}{\sim}$ denotes that the samples are independently and identically distributed. Our goal in Bayesian inference is then to find the _posterior_
-
-::: {.column-page}
-$$
-p(z \mid \\{ x\_i \\}\_{i = 1}^n) \propto p(z) \prod\_{i=1}^{n} p(x\_i \mid z).
-$$
-:::
-
-In general, one cannot obtain a closed form expression for $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$, but one might still be able to _sample_ from $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ with guarantees of converging to the target posterior $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ as the number of samples goes to $\infty$, e.g. MCMC.
-
-As you are hopefully already aware, Turing.jl provides many methods with asymptotic exactness guarantees that we can apply to such a problem!
-
-Unfortunately, these unbiased samplers can be prohibitively expensive to run. As the model $p$ increases in complexity, the convergence of these unbiased samplers can slow down dramatically. Still, in the _infinite_ limit, these methods should converge to the true posterior! But infinity is fairly large, like, _at least_ more than 12, so this might take a while.
-
-In such a case, it might be desirable to sacrifice some of these asymptotic guarantees and instead _approximate_ the posterior $p(z \mid \\{ x_i \\}_{i = 1}^n)$ using some other model which we'll denote $q(z)$.
-
-There are multiple approaches to take in this case, one of which is **variational inference (VI)**.
-
-# Variational Inference (VI)
-
-In VI, we're looking to approximate $p(z \mid \\{ x_i \\}_{i = 1}^n )$ using some _approximate_ or _variational_ posterior $q(z)$.
-
-To approximate something you need a notion of what "close" means. In the context of probability densities, a standard such "measure" of closeness is the _Kullback-Leibler (KL) divergence_, though this is far from the only one. The KL-divergence is defined between two densities $q(z)$ and $p(z \mid \\{ x_i \\}_{i = 1}^n)$ as
-
-::: {.column-page}
-$$
-\begin{align*}
-    \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) &= \int \log \left( \frac{q(z)}{\prod\_{i = 1}^n p(z \mid x\_i)} \right) q(z) \mathrm{d}{z} \\\\
-    &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) - \sum\_{i = 1}^n \log p(z \mid x\_i) \right] \\\\
-    &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(z \mid x\_i) \right].
-\end{align*}
-$$
-:::
-
-It's worth noting that unfortunately the KL-divergence is _not_ a metric/distance in the analysis-sense due to its lack of symmetry. On the other hand, it turns out that minimizing the KL-divergence is actually equivalent to maximizing the log-likelihood! Also, under reasonable restrictions on the densities at hand,
-
-::: {.column-page}
-$$
-\mathrm{D\_{KL}}\left(q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) = 0 \quad \iff \quad q(z) = p(z \mid \\{ x\_i \\}\_{i = 1}^n), \quad \forall z.
-$$
-:::
-
-Therefore one could (and we will) attempt to approximate $p(z \mid \\{ x_i \\}_{i = 1}^n)$ using a density $q(z)$ by minimizing the KL-divergence between these two!
-
-One can also show that $\mathrm{D_{KL}} \ge 0$, which we'll need later. Finally notice that the KL-divergence is only well-defined when in fact $q(z)$ is zero everywhere $p(z \mid \\{ x_i \\}_{i = 1}^n)$ is zero, i.e.
-
-::: {.column-page}
-$$
-\mathrm{supp}\left(q(z)\right) \subseteq \mathrm{supp}\left(p(z \mid x)\right).
-$$
-:::
-
-Otherwise, there might be a point $z_0 \sim q(z)$ such that $p(z_0 \mid \\{ x_i \\}_{i = 1}^n) = 0$, resulting in $\log\left(\frac{q(z)}{0}\right)$ which doesn't make sense!
-
-One major problem: as we can see in the definition of the KL-divergence, we need $p(z \mid \\{ x_i \\}_{i = 1}^n)$ for any $z$ if we want to compute the KL-divergence between this and $q(z)$. We don't have that. The entire reason we even do Bayesian inference is that we don't know the posterior! Clearly this isn't going to work. _Or is it?!_
-
-## Computing KL-divergence without knowing the posterior
-
-First off, recall that
-
-::: {.column-page}
-$$
-p(z \mid x\_i) = \frac{p(x\_i, z)}{p(x\_i)}
-$$
-:::
-
-so we can write
-
-::: {.column-page}
-$$
-\begin{align*}
-\mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) - \log p(x\_i) \right] \\
-    &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] + \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x_i) \right] \\
-    &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] + \sum\_{i = 1}^n \log p(x\_i),
-\end{align*}
-$$
-:::
-
-where in the last equality we used the fact that $p(x_i)$ is independent of $z$.
-
-Now you're probably thinking "Oh great! Now you've introduced $p(x_i)$ which we _also_ can't compute (in general)!". Woah. Calm down human. Let's do some more algebra. The above expression can be rearranged to
-
-::: {.column-page}
-$$
-\mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) + \underbrace{\sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] - \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right]}\_{=: \mathrm{ELBO}(q)} = \underbrace{\sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i) \right]}\_{\text{constant}}.
-$$
-:::
-
-See? The right-hand side is _constant_ and, as we mentioned before, $\mathrm{D_{KL}} \ge 0$. What happens if we try to _maximize_ the term we just gave the completely arbitrary name $\mathrm{ELBO}$? Well, if $\mathrm{ELBO}$ goes up while $p(x_i)$ stays constant then $\mathrm{D_{KL}}$ _has to_ go down! That is, the $q(z)$ which _minimizes_ the KL-divergence is the same $q(z)$ which _maximizes_ $\mathrm{ELBO}(q)$:
-
-::: {.column-page}
-$$
-\underset{q}{\mathrm{argmin}} \ \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) = \underset{q}{\mathrm{argmax}} \ \mathrm{ELBO}(q)
-$$
-:::
-
-where
-
-::: {.column-page}
-$$
-\begin{align*}
-\mathrm{ELBO}(q) &:= \left( \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] \right) - \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] \\
-    &= \left( \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] \right) + \mathbb{H}\left( q(z) \right)
-\end{align*}
-$$
-:::
-
-and $\mathbb{H} \left(q(z) \right)$ denotes the [(differential) entropy](https://www.wikiwand.com/en/Differential_entropy) of $q(z)$.
-
-Assuming the joint $p(x\_i, z)$ and the entropy $\mathbb{H}\left(q(z)\right)$ are both tractable, we can use a Monte Carlo estimate for the remaining expectation. This leaves us with the following tractable expression
-
-::: {.column-page}
-$$
-\underset{q}{\mathrm{argmin}} \ \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) \approx \underset{q}{\mathrm{argmax}} \ \widehat{\mathrm{ELBO}}(q)
-$$
-:::
-
-where
-
-::: {.column-page}
-$$
-\widehat{\mathrm{ELBO}}(q) = \frac{1}{m} \left( \sum\_{k = 1}^m \sum\_{i = 1}^n \log p(x\_i, z\_k) \right) + \mathbb{H} \left(q(z)\right) \quad \text{where} \quad z\_k \sim q(z) \quad \forall k = 1, \dots, m.
-$$
-:::
-
-Hence, as long as we can sample from $q(z)$ somewhat efficiently, we can indeed minimize the KL-divergence! Neat, eh?
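+
+As a quick illustration, here is a toy numerical sketch of such an estimator (hypothetical code; for simplicity it uses the total log joint $\log p(z) + \sum_{i=1}^n \log p(x_i \mid z)$ of a simple Normal-mean model, but any model with a tractable log joint and a samplable $q$ would do):
+
+```{julia}
+#| eval: false
+using Distributions
+
+# Toy model: z ~ Normal(0, 1), x_i ~ Normal(z, 1) for i = 1, ..., n.
+log_joint(x, z) = logpdf(Normal(0, 1), z) + sum(logpdf.(Normal(z, 1), x))
+
+# Monte Carlo estimate of the ELBO for a Gaussian q(z) = Normal(μ, σ),
+# using m draws z_k ~ q(z) plus the closed-form entropy H(q).
+function elbo_estimate(q::Normal, x; m=1_000)
+    zs = rand(q, m)
+    return sum(log_joint(x, z) for z in zs) / m + entropy(q)
+end
+
+x = randn(10) .+ 2                 # some synthetic observations
+elbo_estimate(Normal(0.0, 1.0), x) # ELBO estimate for a standard-normal q
+```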
- -Sidenote: in the case where $q(z)$ is tractable but $\mathbb{H} \left(q(z) \right)$ is _not_ , we can use an Monte-Carlo estimate for this term too but this generally results in a higher-variance estimate. - -Also, I fooled you real good: the ELBO _isn't_ an arbitrary name, hah! In fact it's an abbreviation for the **expected lower bound (ELBO)** because it, uhmm, well, it's the _expected_ lower bound (remember $\mathrm{D_{KL}} \ge 0$). Yup. - -## Maximizing the ELBO - -Finding the optimal $q$ over _all_ possible densities of course isn't feasible. Instead we consider a family of _parameterized_ densities $\mathscr{D}\_{\Theta}$ where $\Theta$ denotes the space of possible parameters. Each density in this family $q\_{\theta} \in \mathscr{D}\_{\Theta}$ is parameterized by a unique $\theta \in \Theta$. Moreover, we'll assume - - 1. $q\_{\theta}(z)$, i.e. evaluating the probability density $q$ at any point $z$, is differentiable - 2. $z \sim q\_{\theta}(z)$, i.e. the process of sampling from $q\_{\theta}(z)$, is differentiable - -(1) is fairly straight-forward, but (2) is a bit tricky. What does it even mean for a _sampling process_ to be differentiable? This is quite an interesting problem in its own right and would require something like a [50-page paper to properly review the different approaches (highly recommended read)](https://arxiv.org/abs/1906.10652). - -We're going to make use of a particular such approach which goes under a bunch of different names: _reparametrization trick_, _path derivative_, etc. This refers to making the assumption that all elements $q\_{\theta} \in \mathscr{Q}\_{\Theta}$ can be considered as reparameterizations of some base density, say $\bar{q}(z)$. That is, if $q\_{\theta} \in \mathscr{Q}\_{\Theta}$ then - -::: {.column-page} -$$ -z \sim q\_{\theta}(z) \quad \iff \quad z := g\_{\theta}(\tilde{z}) \quad \text{where} \quad \bar{z} \sim \bar{q}(z) -$$ -::: - -for some function $g\_{\theta}$ differentiable wrt. $\theta$. So all $q_{\theta} \in \mathscr{Q}\_{\Theta}$ are using the *same* reparameterization-function $g$ but each $q\_{\theta}$ correspond to different choices of $\theta$ for $f\_{\theta}$. - -Under this assumption we can differentiate the sampling process by taking the derivative of $g\_{\theta}$ wrt. $\theta$, and thus we can differentiate the entire $\widehat{\mathrm{ELBO}}(q\_{\theta})$ wrt. $\theta$! With the gradient available we can either try to solve for optimality either by setting the gradient equal to zero or maximize $\widehat{\mathrm{ELBO}}(q\_{\theta})$ stepwise by traversing $\mathscr{Q}\_{\Theta}$ in the direction of steepest ascent. For the sake of generality, we're going to go with the stepwise approach. - -With all this nailed down, we eventually reach the section on **Automatic Differentiation Variational Inference (ADVI)**. - -## Automatic Differentiation Variational Inference (ADVI) - -So let's revisit the assumptions we've made at this point: - - 1. The variational posterior $q\_{\theta}$ is in a parameterized family of densities denoted $\mathscr{Q}\_{\Theta}$, with $\theta \in \Theta$. - - 2. $\mathscr{Q}\_{\Theta}$ is a space of _reparameterizable_ densities with $\bar{q}(z)$ as the base-density. - - 3. The parameterization function $g\_{\theta}$ is differentiable wrt. $\theta$. - - 4. Evaluation of the probability density $q\_{\theta}(z)$ is differentiable wrt. $\theta$. - - 5. $\mathbb{H}\left(q\_{\theta}(z)\right)$ is tractable. - - 6. Evaluation of the joint density $p(x, z)$ is tractable and differentiable wrt. 
$z$ - - 7. The support of $q(z)$ is a subspace of the support of $p(z \mid x)$ : $\mathrm{supp}\left(q(z)\right) \subseteq \mathrm{supp}\left(p(z \mid x)\right)$. - -All of these are not *necessary* to do VI, but they are very convenient and results in a fairly flexible approach. One distribution which has a density satisfying all of the above assumptions _except_ (7) (we'll get back to this in second) for any tractable and differentiable $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ is the good ole' Gaussian/normal distribution: - -::: {.column-page} -$$ -z \sim \mathcal{N}(\mu, \Sigma) \quad \iff \quad z = g\_{\mu, L}(\bar{z}) := \mu + L^T \tilde{z} \quad \text{where} \quad \bar{z} \sim \bar{q}(z) := \mathcal{N}(1\_d, I\_{d \times d}) -$$ -::: - -where $\Sigma = L L^T,$ with $L$ obtained from the Cholesky-decomposition. Abusing notation a bit, we're going to write - -::: {.column-page} -$$ -\theta = (\mu, \Sigma) := (\mu\_1, \dots, \mu\_d, L\_{11}, \dots, L\_{1, d}, L\_{2, 1}, \dots, L\_{2, d}, \dots, L\_{d, 1}, \dots, L\_{d, d}). -$$ -::: - -With this assumption we finally have a tractable expression for $\widehat{\mathrm{ELBO}}(q_{\mu, \Sigma})$! Well, assuming (7) is holds. Since a Gaussian has non-zero probability on the entirety of $\mathbb{R}^d$, we also require $p(z \mid \\{ x_i \\}_{i = 1}^n)$ to have non-zero probability on all of $\mathbb{R}^d$. - -Though not necessary, we'll often make a *mean-field* assumption for the variational posterior $q(z)$, i.e. assume independence between the latent variables. In this case, we'll write - -::: {.column-page} -$$ -\theta = (\mu, \sigma^2) := (\mu\_1, \dots, \mu\_d, \sigma\_1^2, \dots, \sigma\_d^2). -$$ -::: - -### Examples - -As a (trivial) example we could apply the approach described above to is the following generative model for $p(z \mid \\{ x_i \\}\_{i = 1}^n)$: - -::: {.column-page} -$$ -\begin{align*} - m &\sim \mathcal{N}(0, 1) \\ - x\_i &\overset{\text{i.i.d.}}{=} \mathcal{N}(m, 1), \quad i = 1, \dots, n. -\end{align*} -$$ -::: - -In this case $z = m$ and we have the posterior defined $p(m \mid \\{ x\_i \\}\_{i = 1}^n) = p(m) \prod\_{i = 1}^n p(x\_i \mid m)$. Then the variational posterior would be - -::: {.column-page} -$$ -q\_{\mu, \sigma} = \mathcal{N}(\mu, \sigma^2), \quad \text{where} \quad \mu \in \mathbb{R}, \ \sigma^2 \in \mathbb{R}^{ + }. -$$ -::: - -And since prior of $m$, $\mathcal{N}(0, 1)$, has non-zero probability on the entirety of $\mathbb{R}$, same as $q(m)$, i.e. assumption (7) above holds, everything is fine and life is good. - -But what about this generative model for $p(z \mid \\{ x_i \\}_{i = 1}^n)$: - -::: {.column-page} -$$ -\begin{align*} - s &\sim \mathrm{InverseGamma}(2, 3), \\ - m &\sim \mathcal{N}(0, s), \\ - x\_i &\overset{\text{i.i.d.}}{=} \mathcal{N}(m, s), \quad i = 1, \dots, n, -\end{align*} -$$ -::: - -with posterior $p(s, m \mid \\{ x\_i \\}\_{i = 1}^n) = p(s) p(m \mid s) \prod\_{i = 1}^n p(x\_i \mid s, m)$ and the mean-field variational posterior $q(s, m)$ will be - -::: {.column-page} -$$ -q\_{\mu\_1, \mu\_2, \sigma\_1^2, \sigma\_2^2}(s, m) = p\_{\mathcal{N}(\mu\_1, \sigma\_1^2)}(s)\ p\_{\mathcal{N}(\mu\_2, \sigma\_2^2)}(m), -$$ -::: - -where we've denoted the evaluation of the probability density of a Gaussian as $p_{\mathcal{N}(\mu, \sigma^2)}(x)$. - -Observe that $\mathrm{InverseGamma}(2, 3)$ has non-zero probability only on $\mathbb{R}^{ + } := (0, \infty)$ which is clearly not all of $\mathbb{R}$ like $q(s, m)$ has, i.e. 
- -::: {.column-page} -$$ -\mathrm{supp} \left( q(s, m) \right) \not\subseteq \mathrm{supp} \left( p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right). -$$ -::: - -Recall from the definition of the KL-divergence that when this is the case, the KL-divergence isn't well defined. This gets us to the *automatic* part of ADVI. - -### "Automatic"? How? - -For a lot of the standard (continuous) densities $p$ we can actually construct a probability density $\tilde{p}$ with non-zero probability on all of $\mathbb{R}$ by *transforming* the "constrained" probability density $p$ to $\tilde{p}$. In fact, in these cases this is a one-to-one relationship. As we'll see, this helps solve the support-issue we've been going on and on about. - -#### Transforming densities using change of variables - -If we want to compute the probability of $x$ taking a value in some set $A \subseteq \mathrm{supp} \left( p(x) \right)$, we have to integrate $p(x)$ over $A$, i.e. - -::: {.column-page} -$$ -\mathbb{P}_p(x \in A) = \int_A p(x) \mathrm{d}x. -$$ -::: - -This means that if we have a differentiable bijection $f: \mathrm{supp} \left( q(x) \right) \to \mathbb{R}^d$ with differentiable inverse $f^{-1}: \mathbb{R}^d \to \mathrm{supp} \left( p(x) \right)$, we can perform a change of variables - -::: {.column-page} -$$ -\mathbb{P}\_p(x \in A) = \int\_{f^{-1}(A)} p \left(f^{-1}(y) \right) \ \left| \det \mathcal{J}\_{f^{-1}}(y) \right| \mathrm{d}y, -$$ -::: - -where $\mathcal{J}_{f^{-1}}(x)$ denotes the jacobian of $f^{-1}$ evaluated at $x$. Observe that this defines a probability distribution - -::: {.column-page} -$$ -\mathbb{P}\_{\tilde{p}}\left(y \in f^{-1}(A) \right) = \int\_{f^{-1}(A)} \tilde{p}(y) \mathrm{d}y, -$$ -::: - -since $f^{-1}\left(\mathrm{supp} (p(x)) \right) = \mathbb{R}^d$ which has probability 1. This probability distribution has *density* $\tilde{p}(y)$ with $\mathrm{supp} \left( \tilde{p}(y) \right) = \mathbb{R}^d$, defined - -::: {.column-page} -$$ -\tilde{p}(y) = p \left( f^{-1}(y) \right) \ \left| \det \mathcal{J}\_{f^{-1}}(y) \right| -$$ -::: - -or equivalently - -::: {.column-page} -$$ -\tilde{p} \left( f(x) \right) = \frac{p(x)}{\big| \det \mathcal{J}\_{f}(x) \big|} -$$ -::: - -due to the fact that - -::: {.column-page} -$$ -\big| \det \mathcal{J}\_{f^{-1}}(y) \big| = \big| \det \mathcal{J}\_{f}(x) \big|^{-1} -$$ -::: - -*Note: it's also necessary that the log-abs-det-jacobian term is non-vanishing. This can for example be accomplished by assuming $f$ to also be elementwise monotonic.* - -#### Back to VI - -So why is this is useful? Well, we're looking to generalize our approach using a normal distribution to cases where the supports don't match up. How about defining $q(z)$ by - -::: {.column-page} -$$ -\begin{align*} - \eta &\sim \mathcal{N}(\mu, \Sigma), \\\\ - z &= f^{-1}(\eta), -\end{align*} -$$ -::: - -where $f^{-1}: \mathbb{R}^d \to \mathrm{supp} \left( p(z \mid x) \right)$ is a differentiable bijection with differentiable inverse. Then $z \sim q_{\mu, \Sigma}(z) \implies z \in \mathrm{supp} \left( p(z \mid x) \right)$ as we wanted. The resulting variational density is - -::: {.column-page} -$$ -q\_{\mu, \Sigma}(z) = p\_{\mathcal{N}(\mu, \Sigma)}\left( f(z) \right) \ \big| \det \mathcal{J}\_{f}(z) \big|. -$$ -::: - -Note that the way we've constructed $q(z)$ here is basically a reverse of the approach we described above. Here we sample from a distribution with support on $\mathbb{R}$ and transform *to* $\mathrm{supp} \left( p(z \mid x) \right)$. 
-
-If we want to write the ELBO explicitly in terms of $\eta$ rather than $z$, the first term in the ELBO becomes
-
-::: {.column-page}
-$$
-\begin{align*}
-    \mathbb{E}\_{z \sim q_{\mu, \Sigma}(z)} \left[ \log p(x\_i, z) \right] &= \mathbb{E}\_{\eta \sim \mathcal{N}(\mu, \Sigma)} \Bigg[ \log \frac{p\left(x\_i, f^{-1}(\eta) \right)}{\big| \det \mathcal{J}_{f^{-1}}(\eta) \big|} \Bigg] \\
-    &= \mathbb{E}\_{\eta \sim \mathcal{N}(\mu, \Sigma)} \left[ \log p\left(x\_i, f^{-1}(\eta) \right) \right] - \mathbb{E}\_{\eta \sim \mathcal{N}(\mu, \Sigma)} \left[ \left| \det \mathcal{J}\_{f^{-1}}(\eta) \right| \right].
-\end{align*}
-$$
-:::
-
-The entropy is invariant under change of variables, thus $\mathbb{H} \left(q\_{\mu, \Sigma}(z)\right)$ is simply the entropy of the normal distribution which is known analytically.
-
-Hence, the resulting empirical estimate of the ELBO is
-
-::: {.column-page}
-$$
-\begin{align*}
-\widehat{\mathrm{ELBO}}(q\_{\mu, \Sigma}) &= \frac{1}{m} \left( \sum\_{k = 1}^m \sum\_{i = 1}^n \left(\log p\left(x\_i, f^{-1}(\eta_k)\right) - \log \big| \det \mathcal{J}\_{f^{-1}}(\eta\_k) \big| \right) \right) + \mathbb{H} \left(p\_{\mathcal{N}(\mu, \Sigma)}(z)\right) \\
-& \text{where} \quad z\_k \sim \mathcal{N}(\mu, \Sigma) \quad \forall k = 1, \dots, m
-\end{align*}.
-$$
-:::
-
-And maximizing this wrt. $\mu$ and $\Sigma$ is what's referred to as **Automatic Differentiation Variational Inference (ADVI)**!
-
-Now if you want to try it out, [check out the tutorial on how to use ADVI in Turing.jl]({{}})!
+---
+title: Variational Inference
+engine: julia
+aliases:
+  - ../../tutorials/docs-07-for-developers-variational-inference/index.html
+---
+
+# Overview
+
+In this post, we'll examine variational inference (VI), a family of approximate Bayesian inference methods. We will focus on one of the more standard VI methods, Automatic Differentiation Variational Inference (ADVI).
+
+Here, we'll examine the theory behind VI, but if you're interested in using ADVI in Turing, [check out this tutorial]({{}}).
+
+# Motivation
+
+In Bayesian inference, one usually specifies a model as follows: given data $\\{x_i\\}_{i = 1}^n$,
+
+::: {.column-page}
+$$
+\begin{align*}
+    \text{prior:} \quad z &\sim p(z) \\
+    \text{likelihood:} \quad x_i &\overset{\text{i.i.d.}}{\sim} p(x \mid z) \quad \text{where} \quad i = 1, \dots, n
+\end{align*}
+$$
+:::
+
+where $\overset{\text{i.i.d.}}{\sim}$ denotes that the samples are independently and identically distributed. Our goal in Bayesian inference is then to find the _posterior_
+
+::: {.column-page}
+$$
+p(z \mid \\{ x\_i \\}\_{i = 1}^n) \propto p(z) \prod\_{i=1}^{n} p(x\_i \mid z).
+$$
+:::
+
+In general, one cannot obtain a closed-form expression for $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$, but one might still be able to _sample_ from $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ with guarantees of converging to the target posterior as the number of samples goes to $\infty$, e.g. using MCMC.
+
+As you are hopefully already aware, Turing.jl provides many methods with asymptotic exactness guarantees that we can apply to such a problem!
+
+Unfortunately, these unbiased samplers can be prohibitively expensive to run. As the model $p$ increases in complexity, the convergence of these unbiased samplers can slow down dramatically. Still, in the _infinite_ limit, these methods should converge to the true posterior! But infinity is fairly large, like, _at least_ more than 12, so this might take a while.
+
+In such a case, it might be desirable to sacrifice some of these asymptotic guarantees and instead _approximate_ the posterior $p(z \mid \\{ x_i \\}_{i = 1}^n)$ using some other model, which we'll denote $q(z)$.
+
+There are multiple approaches to take in this case, one of which is **variational inference (VI)**.
+
+# Variational Inference (VI)
+
+In VI, we're looking to approximate $p(z \mid \\{ x_i \\}_{i = 1}^n )$ using some _approximate_ or _variational_ posterior $q(z)$.
+
+To approximate something you need a notion of what "close" means. In the context of probability densities, a standard such "measure" of closeness is the _Kullback-Leibler (KL) divergence_, though this is far from the only one. The KL-divergence is defined between two densities $q(z)$ and $p(z \mid \\{ x_i \\}_{i = 1}^n)$ as
+
+::: {.column-page}
+$$
+\begin{align*}
+    \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) &= \int \log \left( \frac{q(z)}{\prod\_{i = 1}^n p(z \mid x\_i)} \right) q(z) \mathrm{d}{z} \\\\
+    &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) - \sum\_{i = 1}^n \log p(z \mid x\_i) \right] \\\\
+    &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(z \mid x\_i) \right].
+\end{align*}
+$$
+:::
+
+It's worth noting that unfortunately the KL-divergence is _not_ a metric/distance in the analysis-sense due to its lack of symmetry. On the other hand, it turns out that minimizing the KL-divergence is actually equivalent to maximizing the log-likelihood! Also, under reasonable restrictions on the densities at hand,
+
+::: {.column-page}
+$$
+\mathrm{D\_{KL}}\left(q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) = 0 \quad \iff \quad q(z) = p(z \mid \\{ x\_i \\}\_{i = 1}^n), \quad \forall z.
+$$
+:::
+
+Therefore one could (and we will) attempt to approximate $p(z \mid \\{ x_i \\}_{i = 1}^n)$ using a density $q(z)$ by minimizing the KL-divergence between these two!
+
+One can also show that $\mathrm{D_{KL}} \ge 0$, which we'll need later. Finally, notice that the KL-divergence is only well-defined when $q(z)$ is zero everywhere $p(z \mid \\{ x_i \\}_{i = 1}^n)$ is zero, i.e.
+
+::: {.column-page}
+$$
+\mathrm{supp}\left(q(z)\right) \subseteq \mathrm{supp}\left(p(z \mid x)\right).
+$$
+:::
+
+Otherwise, there might be a point $z_0 \sim q(z)$ such that $p(z_0 \mid \\{ x_i \\}_{i = 1}^n) = 0$, resulting in $\log\left(\frac{q(z)}{0}\right)$, which doesn't make sense!
+
+One major problem: as we can see in the definition of the KL-divergence, we need $p(z \mid \\{ x_i \\}_{i = 1}^n)$ for any $z$ if we want to compute the KL-divergence between this and $q(z)$. We don't have that. The entire reason we even do Bayesian inference is that we don't know the posterior! Clearly this isn't going to work.
_Or is it?!_
+
+## Computing KL-divergence without knowing the posterior
+
+First off, recall that
+
+::: {.column-page}
+$$
+p(z \mid x\_i) = \frac{p(x\_i, z)}{p(x\_i)}
+$$
+:::
+
+so we can write
+
+::: {.column-page}
+$$
+\begin{align*}
+\mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) - \log p(x\_i) \right] \\
+    &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] + \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x_i) \right] \\
+    &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] + \sum\_{i = 1}^n \log p(x\_i),
+\end{align*}
+$$
+:::
+
+where in the last equality we used the fact that $p(x_i)$ is independent of $z$.
+
+Now you're probably thinking "Oh great! Now you've introduced $p(x_i)$ which we _also_ can't compute (in general)!". Woah. Calm down human. Let's do some more algebra. The above expression can be rearranged to
+
+::: {.column-page}
+$$
+\mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) + \underbrace{\sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] - \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right]}\_{=: \mathrm{ELBO}(q)} = \underbrace{\sum\_{i = 1}^n \log p(x\_i)}\_{\text{constant}}.
+$$
+:::
+
+See? The right-hand side is _constant_ and, as we mentioned before, $\mathrm{D_{KL}} \ge 0$. What happens if we try to _maximize_ the term we just gave the completely arbitrary name $\mathrm{ELBO}$? Well, if $\mathrm{ELBO}$ goes up while $p(x_i)$ stays constant, then $\mathrm{D_{KL}}$ _has to_ go down! That is, the $q(z)$ which _minimizes_ the KL-divergence is the same $q(z)$ which _maximizes_ $\mathrm{ELBO}(q)$:
+
+::: {.column-page}
+$$
+\underset{q}{\mathrm{argmin}} \ \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) = \underset{q}{\mathrm{argmax}} \ \mathrm{ELBO}(q)
+$$
+:::
+
+where
+
+::: {.column-page}
+$$
+\begin{align*}
+\mathrm{ELBO}(q) &:= \left( \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] \right) - \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] \\
+    &= \left( \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] \right) + \mathbb{H}\left( q(z) \right)
+\end{align*}
+$$
+:::
+
+and $\mathbb{H} \left(q(z) \right)$ denotes the [(differential) entropy](https://www.wikiwand.com/en/Differential_entropy) of $q(z)$.
+
+Assuming the joint $p(x_i, z)$ and the entropy $\mathbb{H}\left(q(z)\right)$ are both tractable, we can use a Monte-Carlo estimate for the remaining expectation. This leaves us with the following tractable expression
+
+::: {.column-page}
+$$
+\underset{q}{\mathrm{argmin}} \ \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) \approx \underset{q}{\mathrm{argmax}} \ \widehat{\mathrm{ELBO}}(q)
+$$
+:::
+
+where
+
+::: {.column-page}
+$$
+\widehat{\mathrm{ELBO}}(q) = \frac{1}{m} \left( \sum\_{k = 1}^m \sum\_{i = 1}^n \log p(x\_i, z\_k) \right) + \mathbb{H} \left(q(z)\right) \quad \text{where} \quad z\_k \sim q(z) \quad \forall k = 1, \dots, m.
+$$
+:::
+
+Hence, as long as we can sample from $q(z)$ somewhat efficiently, we can indeed minimize the KL-divergence! Neat, eh?
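+
+To make this concrete, here is a minimal sketch of the estimator $\widehat{\mathrm{ELBO}}$ for a toy model. Both `logjoint` and the choice of a `Normal` variational family are illustrative assumptions for this sketch, not part of any Turing API:
+
+```{julia}
+#| eval: false
+using Distributions
+
+# Hypothetical log-joint log p(z) + ∑ᵢ log p(xᵢ ∣ z) for a toy model with
+# prior z ~ Normal(0, 1) and likelihood xᵢ ~ Normal(z, 1).
+logjoint(x, z) = logpdf(Normal(0, 1), z) + sum(logpdf.(Normal(z, 1), x))
+
+# Empirical ELBO: (1/m) ∑ₖ log p(x, zₖ) + H(q), with zₖ ~ q(z).
+function elbo_estimate(q, x; m=10)
+    zs = rand(q, m)
+    return sum(logjoint(x, z) for z in zs) / m + entropy(q)
+end
+
+elbo_estimate(Normal(0.0, 1.0), randn(100))
+```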
+
+Sidenote: in the case where $q(z)$ is tractable but $\mathbb{H} \left(q(z) \right)$ is _not_, we can use a Monte-Carlo estimate for this term too, but this generally results in a higher-variance estimate.
+
+Also, I fooled you real good: the ELBO _isn't_ an arbitrary name, hah! In fact it's an abbreviation for the **evidence lower bound (ELBO)** because, uhmm, well, it's a lower bound on the (log) evidence $\sum\_{i = 1}^n \log p(x\_i)$ (remember $\mathrm{D_{KL}} \ge 0$). Yup.
+
+## Maximizing the ELBO
+
+Finding the optimal $q$ over _all_ possible densities of course isn't feasible. Instead we consider a family of _parameterized_ densities $\mathscr{Q}\_{\Theta}$ where $\Theta$ denotes the space of possible parameters. Each density in this family $q\_{\theta} \in \mathscr{Q}\_{\Theta}$ is parameterized by a unique $\theta \in \Theta$. Moreover, we'll assume
+
+ 1. $q\_{\theta}(z)$, i.e. evaluating the probability density $q$ at any point $z$, is differentiable
+ 2. $z \sim q\_{\theta}(z)$, i.e. the process of sampling from $q\_{\theta}(z)$, is differentiable
+
+(1) is fairly straightforward, but (2) is a bit tricky. What does it even mean for a _sampling process_ to be differentiable? This is quite an interesting problem in its own right and would require something like a [50-page paper to properly review the different approaches (highly recommended read)](https://arxiv.org/abs/1906.10652).
+
+We're going to make use of a particular such approach which goes under a bunch of different names: _reparametrization trick_, _path derivative_, etc. This refers to making the assumption that all elements $q\_{\theta} \in \mathscr{Q}\_{\Theta}$ can be considered as reparameterizations of some base density, say $\bar{q}(z)$. That is, if $q\_{\theta} \in \mathscr{Q}\_{\Theta}$ then
+
+::: {.column-page}
+$$
+z \sim q\_{\theta}(z) \quad \iff \quad z := g\_{\theta}(\bar{z}) \quad \text{where} \quad \bar{z} \sim \bar{q}(z)
+$$
+:::
+
+for some function $g\_{\theta}$ differentiable wrt. $\theta$. So all $q_{\theta} \in \mathscr{Q}\_{\Theta}$ use the *same* reparameterization function $g$, but each $q\_{\theta}$ corresponds to a different choice of $\theta$ for $g\_{\theta}$.
+
+Under this assumption we can differentiate the sampling process by taking the derivative of $g\_{\theta}$ wrt. $\theta$, and thus we can differentiate the entire $\widehat{\mathrm{ELBO}}(q\_{\theta})$ wrt. $\theta$! With the gradient available we can either try to solve for optimality by setting the gradient equal to zero, or maximize $\widehat{\mathrm{ELBO}}(q\_{\theta})$ stepwise by traversing $\mathscr{Q}\_{\Theta}$ in the direction of steepest ascent. For the sake of generality, we're going to go with the stepwise approach.
+
+With all this nailed down, we eventually reach the section on **Automatic Differentiation Variational Inference (ADVI)**.
+
+## Automatic Differentiation Variational Inference (ADVI)
+
+So let's revisit the assumptions we've made at this point:
+
+ 1. The variational posterior $q\_{\theta}$ is in a parameterized family of densities denoted $\mathscr{Q}\_{\Theta}$, with $\theta \in \Theta$.
+
+ 2. $\mathscr{Q}\_{\Theta}$ is a space of _reparameterizable_ densities with $\bar{q}(z)$ as the base-density.
+
+ 3. The parameterization function $g\_{\theta}$ is differentiable wrt. $\theta$.
+
+ 4. Evaluation of the probability density $q\_{\theta}(z)$ is differentiable wrt. $\theta$.
+
+ 5. $\mathbb{H}\left(q\_{\theta}(z)\right)$ is tractable.
+
+ 6. Evaluation of the joint density $p(x, z)$ is tractable and differentiable wrt. $z$.
+
+ 7. The support of $q(z)$ is a subset of the support of $p(z \mid x)$: $\mathrm{supp}\left(q(z)\right) \subseteq \mathrm{supp}\left(p(z \mid x)\right)$.
+
+None of these assumptions are strictly *necessary* for VI, but they are very convenient and result in a fairly flexible approach. One distribution which has a density satisfying all of the above assumptions _except_ (7) (we'll get back to this in a second) for any tractable and differentiable $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ is the good ole' Gaussian/normal distribution:
+
+::: {.column-page}
+$$
+z \sim \mathcal{N}(\mu, \Sigma) \quad \iff \quad z = g\_{\mu, L}(\bar{z}) := \mu + L \bar{z} \quad \text{where} \quad \bar{z} \sim \bar{q}(z) := \mathcal{N}(0\_d, I\_{d \times d})
+$$
+:::
+
+where $\Sigma = L L^T$, with $L$ obtained from the Cholesky decomposition of $\Sigma$. Abusing notation a bit, we're going to write
+
+::: {.column-page}
+$$
+\theta = (\mu, \Sigma) := (\mu\_1, \dots, \mu\_d, L\_{11}, \dots, L\_{1, d}, L\_{2, 1}, \dots, L\_{2, d}, \dots, L\_{d, 1}, \dots, L\_{d, d}).
+$$
+:::
+
+With this assumption we finally have a tractable expression for $\widehat{\mathrm{ELBO}}(q_{\mu, \Sigma})$! Well, assuming (7) holds. Since a Gaussian has non-zero probability on the entirety of $\mathbb{R}^d$, we also require $p(z \mid \\{ x_i \\}_{i = 1}^n)$ to have non-zero probability on all of $\mathbb{R}^d$.
+
+Though not necessary, we'll often make a *mean-field* assumption for the variational posterior $q(z)$, i.e. assume independence between the latent variables. In this case, we'll write
+
+::: {.column-page}
+$$
+\theta = (\mu, \sigma^2) := (\mu\_1, \dots, \mu\_d, \sigma\_1^2, \dots, \sigma\_d^2).
+$$
+:::
+
+### Examples
+
+As a (trivial) example, we could apply the approach described above to the following generative model for $p(z \mid \\{ x_i \\}\_{i = 1}^n)$:
+
+::: {.column-page}
+$$
+\begin{align*}
+    m &\sim \mathcal{N}(0, 1) \\
+    x\_i &\overset{\text{i.i.d.}}{\sim} \mathcal{N}(m, 1), \quad i = 1, \dots, n.
+\end{align*}
+$$
+:::
+
+In this case $z = m$ and the posterior is given by $p(m \mid \\{ x\_i \\}\_{i = 1}^n) \propto p(m) \prod\_{i = 1}^n p(x\_i \mid m)$. Then the variational posterior would be
+
+::: {.column-page}
+$$
+q\_{\mu, \sigma} = \mathcal{N}(\mu, \sigma^2), \quad \text{where} \quad \mu \in \mathbb{R}, \ \sigma^2 \in \mathbb{R}^+.
+$$
+:::
+
+And since the prior of $m$, $\mathcal{N}(0, 1)$, has non-zero probability on the entirety of $\mathbb{R}$, the same as $q(m)$, assumption (7) above holds and everything is fine and life is good.
+
+But what about this generative model for $p(z \mid \\{ x_i \\}_{i = 1}^n)$:
+
+::: {.column-page}
+$$
+\begin{align*}
+    s &\sim \mathrm{InverseGamma}(2, 3), \\
+    m &\sim \mathcal{N}(0, s), \\
+    x\_i &\overset{\text{i.i.d.}}{\sim} \mathcal{N}(m, s), \quad i = 1, \dots, n,
+\end{align*}
+$$
+:::
+
+with posterior $p(s, m \mid \\{ x\_i \\}\_{i = 1}^n) \propto p(s) p(m \mid s) \prod\_{i = 1}^n p(x\_i \mid s, m)$ and the mean-field variational posterior $q(s, m)$ will be
+
+::: {.column-page}
+$$
+q\_{\mu\_1, \mu\_2, \sigma\_1^2, \sigma\_2^2}(s, m) = p\_{\mathcal{N}(\mu\_1, \sigma\_1^2)}(s)\ p\_{\mathcal{N}(\mu\_2, \sigma\_2^2)}(m),
+$$
+:::
+
+where we've denoted the evaluation of the probability density of a Gaussian as $p_{\mathcal{N}(\mu, \sigma^2)}(x)$.
+
+Observe that $\mathrm{InverseGamma}(2, 3)$ has non-zero probability only on $\mathbb{R}^+ := (0, \infty)$, which is clearly not all of $\mathbb{R}$, unlike $q(s, m)$, i.e.
+
+::: {.column-page}
+$$
+\mathrm{supp} \left( q(s, m) \right) \not\subseteq \mathrm{supp} \left( p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right).
+$$
+:::
+
+Recall from the definition of the KL-divergence that when this is the case, the KL-divergence isn't well-defined. This gets us to the *automatic* part of ADVI.
+
+### "Automatic"? How?
+
+For a lot of the standard (continuous) densities $p$ we can actually construct a probability density $\tilde{p}$ with non-zero probability on all of $\mathbb{R}$ by *transforming* the "constrained" probability density $p$ to $\tilde{p}$. In fact, in these cases this is a one-to-one relationship. As we'll see, this helps solve the support-issue we've been going on and on about.
+
+#### Transforming densities using change of variables
+
+If we want to compute the probability of $x$ taking a value in some set $A \subseteq \mathrm{supp} \left( p(x) \right)$, we have to integrate $p(x)$ over $A$, i.e.
+
+::: {.column-page}
+$$
+\mathbb{P}_p(x \in A) = \int_A p(x) \mathrm{d}x.
+$$
+:::
+
+This means that if we have a differentiable bijection $f: \mathrm{supp} \left( p(x) \right) \to \mathbb{R}^d$ with differentiable inverse $f^{-1}: \mathbb{R}^d \to \mathrm{supp} \left( p(x) \right)$, we can perform a change of variables
+
+::: {.column-page}
+$$
+\mathbb{P}\_p(x \in A) = \int\_{f(A)} p \left(f^{-1}(y) \right) \ \left| \det \mathcal{J}\_{f^{-1}}(y) \right| \mathrm{d}y,
+$$
+:::
+
+where $\mathcal{J}\_{f^{-1}}(y)$ denotes the Jacobian of $f^{-1}$ evaluated at $y$. Observe that this defines a probability distribution
+
+::: {.column-page}
+$$
+\mathbb{P}\_{\tilde{p}}\left(y \in f(A) \right) = \int\_{f(A)} \tilde{p}(y) \mathrm{d}y,
+$$
+:::
+
+since $f\left(\mathrm{supp} (p(x)) \right) = \mathbb{R}^d$, which has probability 1. This probability distribution has *density* $\tilde{p}(y)$ with $\mathrm{supp} \left( \tilde{p}(y) \right) = \mathbb{R}^d$, defined
+
+::: {.column-page}
+$$
+\tilde{p}(y) = p \left( f^{-1}(y) \right) \ \left| \det \mathcal{J}\_{f^{-1}}(y) \right|
+$$
+:::
+
+or equivalently
+
+::: {.column-page}
+$$
+\tilde{p} \left( f(x) \right) = \frac{p(x)}{\big| \det \mathcal{J}\_{f}(x) \big|}
+$$
+:::
+
+due to the fact that
+
+::: {.column-page}
+$$
+\big| \det \mathcal{J}\_{f^{-1}}(y) \big| = \big| \det \mathcal{J}\_{f}(x) \big|^{-1} \quad \text{for} \quad y = f(x).
+$$
+:::
+
+*Note: it's also necessary that the log-abs-det-Jacobian term is non-vanishing. This can for example be accomplished by assuming $f$ to also be elementwise monotonic.*
+
+#### Back to VI
+
+So why is this useful? Well, we're looking to generalize our approach using a normal distribution to cases where the supports don't match up. How about defining $q(z)$ by
+
+::: {.column-page}
+$$
+\begin{align*}
+    \eta &\sim \mathcal{N}(\mu, \Sigma), \\\\
+    z &= f^{-1}(\eta),
+\end{align*}
+$$
+:::
+
+where $f^{-1}: \mathbb{R}^d \to \mathrm{supp} \left( p(z \mid x) \right)$ is a differentiable bijection with differentiable inverse. Then $z \sim q_{\mu, \Sigma}(z) \implies z \in \mathrm{supp} \left( p(z \mid x) \right)$ as we wanted. The resulting variational density is
+
+::: {.column-page}
+$$
+q\_{\mu, \Sigma}(z) = p\_{\mathcal{N}(\mu, \Sigma)}\left( f(z) \right) \ \big| \det \mathcal{J}\_{f}(z) \big|.
+$$
+:::
+
+Note that the way we've constructed $q(z)$ here is basically a reverse of the approach we described above. Here we sample from a distribution with support on $\mathbb{R}$ and transform *to* $\mathrm{supp} \left( p(z \mid x) \right)$.
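+
+In code, this transformation machinery is what [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl) provides for Turing. Here is a rough sketch for the $\mathrm{InverseGamma}$ example from earlier (the exact function names assume a recent Bijectors release):
+
+```{julia}
+#| eval: false
+using Bijectors, Distributions
+
+p = InverseGamma(2, 3)        # support is (0, ∞)
+f = bijector(p)               # differentiable bijection supp(p) → ℝ
+ptilde = transformed(p, f)    # the pushforward density p̃, with support ℝ
+
+y = rand(ptilde)              # a sample living in ℝ
+x = inverse(f)(y)             # mapped back into (0, ∞)
+
+# p̃(y) = p(f⁻¹(y)) |det J_{f⁻¹}(y)|, checked on the log scale:
+logpdf(ptilde, y) ≈ logpdf(p, x) + logabsdetjac(inverse(f), y)
+```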
+
+If we want to write the ELBO explicitly in terms of $\eta$ rather than $z$, the first term is simply a change of variables in the expectation:
+
+::: {.column-page}
+$$
+\mathbb{E}\_{z \sim q\_{\mu, \Sigma}(z)} \left[ \log p(x\_i, z) \right] = \mathbb{E}\_{\eta \sim \mathcal{N}(\mu, \Sigma)} \left[ \log p\left(x\_i, f^{-1}(\eta) \right) \right].
+$$
+:::
+
+The entropy, on the other hand, is _not_ invariant under a change of variables: it picks up the expected log-abs-det-Jacobian of the transformation,
+
+::: {.column-page}
+$$
+\mathbb{H} \left(q\_{\mu, \Sigma}(z)\right) = \mathbb{H} \left(p\_{\mathcal{N}(\mu, \Sigma)}\right) + \mathbb{E}\_{\eta \sim \mathcal{N}(\mu, \Sigma)} \left[ \log \big| \det \mathcal{J}\_{f^{-1}}(\eta) \big| \right],
+$$
+:::
+
+where the entropy of the normal distribution is known analytically.
+
+Hence, the resulting empirical estimate of the ELBO is
+
+::: {.column-page}
+$$
+\begin{align*}
+\widehat{\mathrm{ELBO}}(q\_{\mu, \Sigma}) &= \frac{1}{m} \sum\_{k = 1}^m \left( \sum\_{i = 1}^n \log p\left(x\_i, f^{-1}(\eta\_k)\right) + \log \big| \det \mathcal{J}\_{f^{-1}}(\eta\_k) \big| \right) + \mathbb{H} \left(p\_{\mathcal{N}(\mu, \Sigma)}(z)\right) \\
+& \text{where} \quad \eta\_k \sim \mathcal{N}(\mu, \Sigma) \quad \forall k = 1, \dots, m.
+\end{align*}
+$$
+:::
+
+And maximizing this wrt. $\mu$ and $\Sigma$ is what's referred to as **Automatic Differentiation Variational Inference (ADVI)**!
+
+Now if you want to try it out, [check out the tutorial on how to use ADVI in Turing.jl]({{}})!

From 254adc1f099ed133bf274cadafc41bd4ed41c51d Mon Sep 17 00:00:00 2001
From: Penelope Yong
Date: Thu, 16 Jan 2025 11:27:34 +0000
Subject: [PATCH 3/4] unix2dos

---
 developers/compiler/design-overview/index.qmd | 612 +++++------
 .../compiler/minituring-compiler/index.qmd    | 590 +++++------
 .../compiler/minituring-contexts/index.qmd    | 612 +++++------
 developers/contributing/index.qmd             | 156 +--
 .../abstractmcmc-interface/index.qmd          | 646 ++++++------
 .../inference/abstractmcmc-turing/index.qmd   | 658 ++++++------
 .../inference/implementing-samplers/index.qmd | 990 +++++++++---------
 .../inference/variational-inference/index.qmd | 770 +++++++-------
 8 files changed, 2517 insertions(+), 2517 deletions(-)

diff --git a/developers/compiler/design-overview/index.qmd b/developers/compiler/design-overview/index.qmd
index e389c44d0..3645422e7 100755
--- a/developers/compiler/design-overview/index.qmd
+++ b/developers/compiler/design-overview/index.qmd
@@ -1,306 +1,306 @@
----
-title: Turing Compiler Design (Outdated)
-engine: julia
-aliases:
-  - ../../../tutorials/docs-05-for-developers-compiler/index.html
----
-
-```{julia}
-#| echo: false
-#| output: false
-using Pkg;
-Pkg.instantiate();
-```
-
-In this section, the current design of Turing's model "compiler" is described which enables Turing to perform various types of Bayesian inference without changing the model definition. The "compiler" is essentially just a macro that rewrites the user's model definition to a function that generates a `Model` struct that Julia's dispatch can operate on and that Julia's compiler can successfully do type inference on for efficient machine code generation.
- -# Overview - -The following terminology will be used in this section: - - - `D`: observed data variables conditioned upon in the posterior, - - `P`: parameter variables distributed according to the prior distributions, these will also be referred to as random variables, - - `Model`: a fully defined probabilistic model with input data - -`Turing`'s `@model` macro rewrites the user-provided function definition such that it can be used to instantiate a `Model` by passing in the observed data `D`. - -The following are the main jobs of the `@model` macro: - - 1. Parse `~` and `.~` lines, e.g. `y .~ Normal.(c*x, 1.0)` - 2. Figure out if a variable belongs to the data `D` and or to the parameters `P` - 3. Enable the handling of missing data variables in `D` when defining a `Model` and treating them as parameter variables in `P` instead - 4. Enable the tracking of random variables using the data structures `VarName` and `VarInfo` - 5. Change `~`/`.~` lines with a variable in `P` on the LHS to a call to `tilde_assume` or `dot_tilde_assume` - 6. Change `~`/`.~` lines with a variable in `D` on the LHS to a call to `tilde_observe` or `dot_tilde_observe` - 7. Enable type stable automatic differentiation of the model using type parameters - -## The model - -A `model::Model` is a callable struct that one can sample from by calling - -```{julia} -#| eval: false -(model::Model)([rng, varinfo, sampler, context]) -``` - -where `rng` is a random number generator (default: `Random.default_rng()`), `varinfo` is a data structure that stores information -about the random variables (default: `DynamicPPL.VarInfo()`), `sampler` is a sampling algorithm (default: `DynamicPPL.SampleFromPrior()`), -and `context` is a sampling context that can, e.g., modify how the log probability is accumulated (default: `DynamicPPL.DefaultContext()`). - -Sampling resets the log joint probability of `varinfo` and increases the evaluation counter of `sampler`. If `context` is a `LikelihoodContext`, -only the log likelihood of `D` will be accumulated, whereas with `PriorContext` only the log prior probability of `P` is. With the `DefaultContext` the log joint probability of both `P` and `D` is accumulated. - -The `Model` struct contains the four internal fields `f`, `args`, `defaults`, and `context`. -When `model::Model` is called, then the internal function `model.f` is called as `model.f(rng, varinfo, sampler, context, model.args...)` -(for multithreaded sampling, instead of `varinfo` a threadsafe wrapper is passed to `model.f`). -The positional and keyword arguments that were passed to the user-defined model function when the model was created are saved as a `NamedTuple` -in `model.args`. The default values of the positional and keyword arguments of the user-defined model functions, if any, are saved as a `NamedTuple` -in `model.defaults`. They are used for constructing model instances with different arguments by the `logprob` and `prob` string macros. -The `context` variable sets an evaluation context that can be used to control for instance whether log probabilities should be evaluated for the prior, likelihood, or joint probability. By default it is set to evaluate the log joint. - -# Example - -Let's take the following model as an example: - -```{julia} -#| eval: false -@model function gauss( - x=missing, y=1.0, ::Type{TV}=Vector{Float64} -) where {TV<:AbstractVector} - if x === missing - x = TV(undef, 3) - end - p = TV(undef, 2) - p[1] ~ InverseGamma(2, 3) - p[2] ~ Normal(0, 1.0) - @. 
x[1:2] ~ Normal(p[2], sqrt(p[1])) - x[3] ~ Normal() - return y ~ Normal(p[2], sqrt(p[1])) -end -``` - -The above call of the `@model` macro defines the function `gauss` with positional arguments `x`, `y`, and `::Type{TV}`, rewritten in -such a way that every call of it returns a `model::Model`. Note that only the function body is modified by the `@model` macro, and the -function signature is left untouched. It is also possible to implement models with keyword arguments such as - -```{julia} -#| eval: false -@model function gauss( - ::Type{TV}=Vector{Float64}; x=missing, y=1.0 -) where {TV<:AbstractVector} - return ... -end -``` - -This would allow us to generate a model by calling `gauss(; x = rand(3))`. - -If an argument has a default value `missing`, it is treated as a random variable. For variables which require an initialization because we -need to loop or broadcast over its elements, such as `x` above, the following needs to be done: - -```{julia} -#| eval: false -if x === missing - x = ... -end -``` - -Note that since `gauss` behaves like a regular function it is possible to define additional dispatches in a second step as well. For -instance, we could achieve the same behaviour by - -```{julia} -#| eval: false -@model function gauss(x, y=1.0, ::Type{TV}=Vector{Float64}) where {TV<:AbstractVector} - p = TV(undef, 2) - return ... -end - -function gauss(::Missing, y=1.0, ::Type{TV}=Vector{Float64}) where {TV<:AbstractVector} - return gauss(TV(undef, 3), y, TV) -end -``` - -If `x` is sampled as a whole from a distribution and not indexed, e.g., `x ~ Normal(...)` or `x ~ MvNormal(...)`, -there is no need to initialize it in an `if`-block. - -## Step 1: Break up the model definition - -First, the `@model` macro breaks up the user-provided function definition using `DynamicPPL.build_model_info`. This function -returns a dictionary consisting of: - - - `allargs_exprs`: The expressions of the positional and keyword arguments, without default values. - - `allargs_syms`: The names of the positional and keyword arguments, e.g., `[:x, :y, :TV]` above. - - `allargs_namedtuple`: An expression that constructs a `NamedTuple` of the positional and keyword arguments, e.g., `:((x = x, y = y, TV = TV))` above. - - `defaults_namedtuple`: An expression that constructs a `NamedTuple` of the default positional and keyword arguments, if any, e.g., `:((x = missing, y = 1, TV = Vector{Float64}))` above. - - `modeldef`: A dictionary with the name, arguments, and function body of the model definition, as returned by `MacroTools.splitdef`. - -## Step 2: Generate the body of the internal model function - -In a second step, `DynamicPPL.generate_mainbody` generates the main part of the transformed function body using the user-provided function body -and the provided function arguments, without default values, for figuring out if a variable denotes an observation or a random variable. -Hereby the function `DynamicPPL.generate_tilde` replaces the `L ~ R` lines in the model and the function `DynamicPPL.generate_dot_tilde` replaces -the `@. L ~ R` and `L .~ R` lines in the model. 
- -In the above example, `p[1] ~ InverseGamma(2, 3)` is replaced with something similar to - -```{julia} -#| eval: false -#= REPL[25]:6 =# -begin - var"##tmpright#323" = InverseGamma(2, 3) - var"##tmpright#323" isa Union{Distribution,AbstractVector{<:Distribution}} || throw( - ArgumentError( - "Right-hand side of a ~ must be subtype of Distribution or a vector of Distributions.", - ), - ) - var"##vn#325" = (DynamicPPL.VarName)(:p, ((1,),)) - var"##inds#326" = ((1,),) - p[1] = (DynamicPPL.tilde_assume)( - _rng, - _context, - _sampler, - var"##tmpright#323", - var"##vn#325", - var"##inds#326", - _varinfo, - ) -end -``` - -Here the first line is a so-called line number node that enables more helpful error messages by providing users with the exact location -of the error in their model definition. Then the right hand side (RHS) of the `~` is assigned to a variable (with an automatically generated name). -We check that the RHS is a distribution or an array of distributions, otherwise an error is thrown. -Next we extract a compact representation of the variable with its name and index (or indices). Finally, the `~` expression is replaced with -a call to `DynamicPPL.tilde_assume` since the compiler figured out that `p[1]` is a random variable using the following -heuristic: - - 1. If the symbol on the LHS of `~`, `:p` in this case, is not among the arguments to the model, `(:x, :y, :T)` in this case, it is a random variable. - 2. If the symbol on the LHS of `~`, `:p` in this case, is among the arguments to the model but has a value of `missing`, it is a random variable. - 3. If the value of the LHS of `~`, `p[1]` in this case, is `missing`, then it is a random variable. - 4. Otherwise, it is treated as an observation. - -The `DynamicPPL.tilde_assume` function takes care of sampling the random variable, if needed, and updating its value and the accumulated log joint -probability in the `_varinfo` object. If `L ~ R` is an observation, `DynamicPPL.tilde_observe` is called with the same arguments except the -random number generator `_rng` (since observations are never sampled). - -A similar transformation is performed for expressions of the form `@. L ~ R` and `L .~ R`. For instance, -`@. x[1:2] ~ Normal(p[2], sqrt(p[1]))` is replaced with - -```{julia} -#| eval: false -#= REPL[25]:8 =# -begin - var"##tmpright#331" = Normal.(p[2], sqrt.(p[1])) - var"##tmpright#331" isa Union{Distribution,AbstractVector{<:Distribution}} || throw( - ArgumentError( - "Right-hand side of a ~ must be subtype of Distribution or a vector of Distributions.", - ), - ) - var"##vn#333" = (DynamicPPL.VarName)(:x, ((1:2,),)) - var"##inds#334" = ((1:2,),) - var"##isassumption#335" = begin - let var"##vn#336" = (DynamicPPL.VarName)(:x, ((1:2,),)) - if !((DynamicPPL.inargnames)(var"##vn#336", _model)) || - (DynamicPPL.inmissings)(var"##vn#336", _model) - true - else - x[1:2] === missing - end - end - end - if var"##isassumption#335" - x[1:2] .= (DynamicPPL.dot_tilde_assume)( - _rng, - _context, - _sampler, - var"##tmpright#331", - x[1:2], - var"##vn#333", - var"##inds#334", - _varinfo, - ) - else - (DynamicPPL.dot_tilde_observe)( - _context, - _sampler, - var"##tmpright#331", - x[1:2], - var"##vn#333", - var"##inds#334", - _varinfo, - ) - end -end -``` - -The main difference in the expanded code between `L ~ R` and `@. L ~ R` is that the former doesn't assume `L` to be defined, it can be a new Julia variable in the scope, while the latter assumes `L` already exists. 
Moreover, `DynamicPPL.dot_tilde_assume` and `DynamicPPL.dot_tilde_observe` are called -instead of `DynamicPPL.tilde_assume` and `DynamicPPL.tilde_observe`. - -## Step 3: Replace the user-provided function body - -Finally, we replace the user-provided function body using `DynamicPPL.build_output`. This function uses `MacroTools.combinedef` to reassemble -the user-provided function with a new function body. In the modified function body an anonymous function is created whose function body -was generated in step 2 above and whose arguments are - - - a random number generator `_rng`, - - a model `_model`, - - a datastructure `_varinfo`, - - a sampler `_sampler`, - - a sampling context `_context`, - - and all positional and keyword arguments of the user-provided model function as positional arguments - without any default values. Finally, in the new function body a `model::Model` with this anonymous function as internal function is returned. - -# `VarName` - -In order to track random variables in the sampling process, `Turing` uses the `VarName` struct which acts as a random variable identifier generated at runtime. The `VarName` of a random variable is generated from the expression on the LHS of a `~` statement when the symbol on the LHS is in the set `P` of unobserved random variables. Every `VarName` instance has a type parameter `sym` which is the symbol of the Julia variable in the model that the random variable belongs to. For example, `x[1] ~ Normal()` will generate an instance of `VarName{:x}` assuming `x` is an unobserved random variable. Every `VarName` also has a field `indexing`, which stores the indices required to access the random variable from the Julia variable indicated by `sym` as a tuple of tuples. Each element of the tuple thereby contains the indices of one indexing operation (`VarName` also supports hierarchical arrays and range indexing). Some examples: - - - `x ~ Normal()` will generate a `VarName(:x, ())`. - - `x[1] ~ Normal()` will generate a `VarName(:x, ((1,),))`. - - `x[:,1] ~ MvNormal(zeros(2), I)` will generate a `VarName(:x, ((Colon(), 1),))`. - - `x[:,1][1+1] ~ Normal()` will generate a `VarName(:x, ((Colon(), 1), (2,)))`. - -The easiest way to manually construct a `VarName` is to use the `@varname` macro on an indexing expression, which will take the `sym` value from the actual variable name, and put the index values appropriately into the constructor. - -# `VarInfo` - -## Overview - -`VarInfo` is the data structure in `Turing` that facilitates tracking random variables and certain metadata about them that are required for sampling. For instance, the distribution of every random variable is stored in `VarInfo` because we need to know the support of every random variable when sampling using HMC for example. Random variables whose distributions have a constrained support are transformed using a bijector from [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl) so that the sampling happens in the unconstrained space. Different samplers require different metadata about the random variables. - -The definition of `VarInfo` in `Turing` is: - -```{julia} -#| eval: false -struct VarInfo{Tmeta, Tlogp} <: AbstractVarInfo - metadata::Tmeta - logp::Base.RefValue{Tlogp} - num_produce::Base.RefValue{Int} -end -``` - -Based on the type of `metadata`, the `VarInfo` is either aliased `UntypedVarInfo` or `TypedVarInfo`. `metadata` can be either a subtype of the union type `Metadata` or a `NamedTuple` of multiple such subtypes. Let `vi` be an instance of `VarInfo`. 
If `vi isa VarInfo{<:Metadata}`, then it is called an `UntypedVarInfo`. If `vi isa VarInfo{<:NamedTuple}`, then `vi.metadata` would be a `NamedTuple` mapping each symbol in `P` to an instance of `Metadata`. `vi` would then be called a `TypedVarInfo`. The other fields of `VarInfo` include `logp` which is used to accumulate the log probability or log probability density of the variables in `P` and `D`. `num_produce` keeps track of how many observations have been made in the model so far. This is incremented when running a `~` statement when the symbol on the LHS is in `D`. - -## `Metadata` - -The `Metadata` struct stores some metadata about the random variables sampled. This helps -query certain information about a variable such as: its distribution, which samplers -sample this variable, its value and whether this value is transformed to real space or -not. Let `md` be an instance of `Metadata`: - - - `md.vns` is the vector of all `VarName` instances. Let `vn` be an arbitrary element of `md.vns` - - `md.idcs` is the dictionary that maps each `VarName` instance to its index in - `md.vns`, `md.ranges`, `md.dists`, `md.orders` and `md.flags`. - - `md.vns[md.idcs[vn]] == vn`. - - `md.dists[md.idcs[vn]]` is the distribution of `vn`. - - `md.gids[md.idcs[vn]]` is the set of algorithms used to sample `vn`. This is used in - the Gibbs sampling process. - - `md.orders[md.idcs[vn]]` is the number of `observe` statements before `vn` is sampled. - - `md.ranges[md.idcs[vn]]` is the index range of `vn` in `md.vals`. - - `md.vals[md.ranges[md.idcs[vn]]]` is the linearized vector of values of corresponding to `vn`. - - `md.flags` is a dictionary of true/false flags. `md.flags[flag][md.idcs[vn]]` is the - value of `flag` corresponding to `vn`. - -Note that in order to make `md::Metadata` type stable, all the `md.vns` must have the same symbol and distribution type. However, one can have a single Julia variable, e.g. `x`, that is a matrix or a hierarchical array sampled in partitions, e.g. `x[1][:] ~ MvNormal(zeros(2), I); x[2][:] ~ MvNormal(ones(2), I)`. The symbol `x` can still be managed by a single `md::Metadata` without hurting the type stability since all the distributions on the RHS of `~` are of the same type. - -However, in `Turing` models one cannot have this restriction, so we must use a type unstable `Metadata` if we want to use one `Metadata` instance for the whole model. This is what `UntypedVarInfo` does. A type unstable `Metadata` will still work but will have inferior performance. - -To strike a balance between flexibility and performance when constructing the `spl::Sampler` instance, the model is first run by sampling the parameters in `P` from their priors using an `UntypedVarInfo`, i.e. a type unstable `Metadata` is used for all the variables. Then once all the symbols and distribution types have been identified, a `vi::TypedVarInfo` is constructed where `vi.metadata` is a `NamedTuple` mapping each symbol in `P` to a specialized instance of `Metadata`. So as long as each symbol in `P` is sampled from only one type of distributions, `vi::TypedVarInfo` will have fully concretely typed fields which brings out the peak performance of Julia. 
+---
+title: Turing Compiler Design (Outdated)
+engine: julia
+aliases:
+  - ../../../tutorials/docs-05-for-developers-compiler/index.html
+---
+
+```{julia}
+#| echo: false
+#| output: false
+using Pkg;
+Pkg.instantiate();
+```
+
+This section describes the current design of Turing's model "compiler", which enables Turing to perform various types of Bayesian inference without changing the model definition. The "compiler" is essentially just a macro that rewrites the user's model definition to a function that generates a `Model` struct, on which Julia's dispatch can operate and on which Julia's compiler can successfully do type inference for efficient machine code generation.
+
+# Overview
+
+The following terminology will be used in this section:
+
+ - `D`: observed data variables conditioned upon in the posterior,
+ - `P`: parameter variables distributed according to the prior distributions; these will also be referred to as random variables,
+ - `Model`: a fully defined probabilistic model with input data.
+
+`Turing`'s `@model` macro rewrites the user-provided function definition such that it can be used to instantiate a `Model` by passing in the observed data `D`.
+
+The following are the main jobs of the `@model` macro:
+
+ 1. Parse `~` and `.~` lines, e.g. `y .~ Normal.(c*x, 1.0)`
+ 2. Figure out if a variable belongs to the data `D` or to the parameters `P`
+ 3. Enable the handling of missing data variables in `D` when defining a `Model`, treating them as parameter variables in `P` instead
+ 4. Enable the tracking of random variables using the data structures `VarName` and `VarInfo`
+ 5. Change `~`/`.~` lines with a variable in `P` on the LHS to a call to `tilde_assume` or `dot_tilde_assume`
+ 6. Change `~`/`.~` lines with a variable in `D` on the LHS to a call to `tilde_observe` or `dot_tilde_observe`
+ 7. Enable type-stable automatic differentiation of the model using type parameters
+
+## The model
+
+A `model::Model` is a callable struct that one can sample from by calling
+
+```{julia}
+#| eval: false
+(model::Model)([rng, varinfo, sampler, context])
+```
+
+where `rng` is a random number generator (default: `Random.default_rng()`), `varinfo` is a data structure that stores information
+about the random variables (default: `DynamicPPL.VarInfo()`), `sampler` is a sampling algorithm (default: `DynamicPPL.SampleFromPrior()`),
+and `context` is a sampling context that can, e.g., modify how the log probability is accumulated (default: `DynamicPPL.DefaultContext()`).
+
+Sampling resets the log joint probability of `varinfo` and increases the evaluation counter of `sampler`. If `context` is a `LikelihoodContext`,
+only the log likelihood of `D` will be accumulated, whereas with `PriorContext` only the log prior probability of `P` is. With the `DefaultContext` the log joint probability of both `P` and `D` is accumulated.
+
+The `Model` struct contains the four internal fields `f`, `args`, `defaults`, and `context`.
+When `model::Model` is called, then the internal function `model.f` is called as `model.f(rng, varinfo, sampler, context, model.args...)`
+(for multithreaded sampling, instead of `varinfo` a threadsafe wrapper is passed to `model.f`).
+The positional and keyword arguments that were passed to the user-defined model function when the model was created are saved as a `NamedTuple`
+in `model.args`. The default values of the positional and keyword arguments of the user-defined model functions, if any, are saved as a `NamedTuple`
+in `model.defaults`.
They are used for constructing model instances with different arguments by the `logprob` and `prob` string macros. +The `context` variable sets an evaluation context that can be used to control for instance whether log probabilities should be evaluated for the prior, likelihood, or joint probability. By default it is set to evaluate the log joint. + +# Example + +Let's take the following model as an example: + +```{julia} +#| eval: false +@model function gauss( + x=missing, y=1.0, ::Type{TV}=Vector{Float64} +) where {TV<:AbstractVector} + if x === missing + x = TV(undef, 3) + end + p = TV(undef, 2) + p[1] ~ InverseGamma(2, 3) + p[2] ~ Normal(0, 1.0) + @. x[1:2] ~ Normal(p[2], sqrt(p[1])) + x[3] ~ Normal() + return y ~ Normal(p[2], sqrt(p[1])) +end +``` + +The above call of the `@model` macro defines the function `gauss` with positional arguments `x`, `y`, and `::Type{TV}`, rewritten in +such a way that every call of it returns a `model::Model`. Note that only the function body is modified by the `@model` macro, and the +function signature is left untouched. It is also possible to implement models with keyword arguments such as + +```{julia} +#| eval: false +@model function gauss( + ::Type{TV}=Vector{Float64}; x=missing, y=1.0 +) where {TV<:AbstractVector} + return ... +end +``` + +This would allow us to generate a model by calling `gauss(; x = rand(3))`. + +If an argument has a default value `missing`, it is treated as a random variable. For variables which require an initialization because we +need to loop or broadcast over its elements, such as `x` above, the following needs to be done: + +```{julia} +#| eval: false +if x === missing + x = ... +end +``` + +Note that since `gauss` behaves like a regular function it is possible to define additional dispatches in a second step as well. For +instance, we could achieve the same behaviour by + +```{julia} +#| eval: false +@model function gauss(x, y=1.0, ::Type{TV}=Vector{Float64}) where {TV<:AbstractVector} + p = TV(undef, 2) + return ... +end + +function gauss(::Missing, y=1.0, ::Type{TV}=Vector{Float64}) where {TV<:AbstractVector} + return gauss(TV(undef, 3), y, TV) +end +``` + +If `x` is sampled as a whole from a distribution and not indexed, e.g., `x ~ Normal(...)` or `x ~ MvNormal(...)`, +there is no need to initialize it in an `if`-block. + +## Step 1: Break up the model definition + +First, the `@model` macro breaks up the user-provided function definition using `DynamicPPL.build_model_info`. This function +returns a dictionary consisting of: + + - `allargs_exprs`: The expressions of the positional and keyword arguments, without default values. + - `allargs_syms`: The names of the positional and keyword arguments, e.g., `[:x, :y, :TV]` above. + - `allargs_namedtuple`: An expression that constructs a `NamedTuple` of the positional and keyword arguments, e.g., `:((x = x, y = y, TV = TV))` above. + - `defaults_namedtuple`: An expression that constructs a `NamedTuple` of the default positional and keyword arguments, if any, e.g., `:((x = missing, y = 1, TV = Vector{Float64}))` above. + - `modeldef`: A dictionary with the name, arguments, and function body of the model definition, as returned by `MacroTools.splitdef`. 
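+
+For intuition, the sketch below shows roughly what `MacroTools.splitdef` produces when handed a function definition (a standalone illustration; the exact contents of the returned `Dict` depend on the MacroTools version):
+
+```{julia}
+#| eval: false
+using MacroTools
+
+ex = :(function gauss(x, y=1.0)
+    return x + y
+end)
+
+d = MacroTools.splitdef(ex)
+d[:name]  # :gauss
+d[:args]  # the positional arguments, including the default value for y
+d[:body]  # the function body as an Expr
+```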
+
+## Step 2: Generate the body of the internal model function
+
+In a second step, `DynamicPPL.generate_mainbody` generates the main part of the transformed function body using the user-provided function body
+and the provided function arguments, without default values, to figure out whether a variable denotes an observation or a random variable.
+In this process, the function `DynamicPPL.generate_tilde` replaces the `L ~ R` lines in the model and the function `DynamicPPL.generate_dot_tilde` replaces
+the `@. L ~ R` and `L .~ R` lines in the model.
+
+In the above example, `p[1] ~ InverseGamma(2, 3)` is replaced with something similar to
+
+```{julia}
+#| eval: false
+#= REPL[25]:6 =#
+begin
+    var"##tmpright#323" = InverseGamma(2, 3)
+    var"##tmpright#323" isa Union{Distribution,AbstractVector{<:Distribution}} || throw(
+        ArgumentError(
+            "Right-hand side of a ~ must be subtype of Distribution or a vector of Distributions.",
+        ),
+    )
+    var"##vn#325" = (DynamicPPL.VarName)(:p, ((1,),))
+    var"##inds#326" = ((1,),)
+    p[1] = (DynamicPPL.tilde_assume)(
+        _rng,
+        _context,
+        _sampler,
+        var"##tmpright#323",
+        var"##vn#325",
+        var"##inds#326",
+        _varinfo,
+    )
+end
+```
+
+Here the first line is a so-called line number node that enables more helpful error messages by providing users with the exact location
+of the error in their model definition. Then the right hand side (RHS) of the `~` is assigned to a variable (with an automatically generated name).
+We check that the RHS is a distribution or an array of distributions; otherwise an error is thrown.
+Next we extract a compact representation of the variable with its name and index (or indices). Finally, the `~` expression is replaced with
+a call to `DynamicPPL.tilde_assume` since the compiler figured out that `p[1]` is a random variable using the following
+heuristic:
+
+ 1. If the symbol on the LHS of `~`, `:p` in this case, is not among the arguments to the model, `(:x, :y, :TV)` in this case, it is a random variable.
+ 2. If the symbol on the LHS of `~`, `:p` in this case, is among the arguments to the model but has a value of `missing`, it is a random variable.
+ 3. If the value of the LHS of `~`, `p[1]` in this case, is `missing`, then it is a random variable.
+ 4. Otherwise, it is treated as an observation.
+
+The `DynamicPPL.tilde_assume` function takes care of sampling the random variable, if needed, and updating its value and the accumulated log joint
+probability in the `_varinfo` object. If `L ~ R` is an observation, `DynamicPPL.tilde_observe` is called with the same arguments except the
+random number generator `_rng` (since observations are never sampled).
+
+A similar transformation is performed for expressions of the form `@. L ~ R` and `L .~ R`. For instance,
`@. x[1:2] ~ Normal(p[2], sqrt(p[1]))` is replaced with
+
+```{julia}
+#| eval: false
+#= REPL[25]:8 =#
+begin
+    var"##tmpright#331" = Normal.(p[2], sqrt.(p[1]))
+    var"##tmpright#331" isa Union{Distribution,AbstractVector{<:Distribution}} || throw(
+        ArgumentError(
+            "Right-hand side of a ~ must be subtype of Distribution or a vector of Distributions.",
+        ),
+    )
+    var"##vn#333" = (DynamicPPL.VarName)(:x, ((1:2,),))
+    var"##inds#334" = ((1:2,),)
+    var"##isassumption#335" = begin
+        let var"##vn#336" = (DynamicPPL.VarName)(:x, ((1:2,),))
+            if !((DynamicPPL.inargnames)(var"##vn#336", _model)) ||
+               (DynamicPPL.inmissings)(var"##vn#336", _model)
+                true
+            else
+                x[1:2] === missing
+            end
+        end
+    end
+    if var"##isassumption#335"
+        x[1:2] .= (DynamicPPL.dot_tilde_assume)(
+            _rng,
+            _context,
+            _sampler,
+            var"##tmpright#331",
+            x[1:2],
+            var"##vn#333",
+            var"##inds#334",
+            _varinfo,
+        )
+    else
+        (DynamicPPL.dot_tilde_observe)(
+            _context,
+            _sampler,
+            var"##tmpright#331",
+            x[1:2],
+            var"##vn#333",
+            var"##inds#334",
+            _varinfo,
+        )
+    end
+end
+```
+
+The main difference in the expanded code between `L ~ R` and `@. L ~ R` is that the former doesn't assume `L` to be defined (it can be a new Julia variable in the scope), while the latter assumes that `L` already exists. Moreover, `DynamicPPL.dot_tilde_assume` and `DynamicPPL.dot_tilde_observe` are called
+instead of `DynamicPPL.tilde_assume` and `DynamicPPL.tilde_observe`.
+
+## Step 3: Replace the user-provided function body
+
+Finally, we replace the user-provided function body using `DynamicPPL.build_output`. This function uses `MacroTools.combinedef` to reassemble
+the user-provided function with a new function body. In the modified function body an anonymous function is created whose function body
+was generated in step 2 above and whose arguments are
+
+  - a random number generator `_rng`,
+  - a model `_model`,
+  - a data structure `_varinfo`,
+  - a sampler `_sampler`,
+  - a sampling context `_context`,
+  - and all positional and keyword arguments of the user-provided model function as positional arguments, without any default values.
+
+In the new function body, a `model::Model` with this anonymous function as its internal function is then returned.
+
+# `VarName`
+
+In order to track random variables in the sampling process, `Turing` uses the `VarName` struct, which acts as a random variable identifier generated at runtime. The `VarName` of a random variable is generated from the expression on the LHS of a `~` statement when the symbol on the LHS is in the set `P` of unobserved random variables. Every `VarName` instance has a type parameter `sym` which is the symbol of the Julia variable in the model that the random variable belongs to. For example, `x[1] ~ Normal()` will generate an instance of `VarName{:x}` assuming `x` is an unobserved random variable. Every `VarName` also has a field `indexing`, which stores the indices required to access the random variable from the Julia variable indicated by `sym` as a tuple of tuples. Each element of the tuple contains the indices of one indexing operation (`VarName` also supports hierarchical arrays and range indexing). Some examples:
+
+  - `x ~ Normal()` will generate a `VarName(:x, ())`.
+  - `x[1] ~ Normal()` will generate a `VarName(:x, ((1,),))`.
+  - `x[:,1] ~ MvNormal(zeros(2), I)` will generate a `VarName(:x, ((Colon(), 1),))`.
+  - `x[:,1][1+1] ~ Normal()` will generate a `VarName(:x, ((Colon(), 1), (2,)))`.
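+
+For instance, the last of these can be written out as follows (a sketch only; the internal representation of `VarName` has changed across DynamicPPL versions, so the exact constructor may differ):
+
+```{julia}
+#| eval: false
+using DynamicPPL
+
+# Two equivalent ways of building the identifier for `x[:, 1][2]`:
+vn1 = VarName(:x, ((Colon(), 1), (2,)))
+vn2 = @varname x[:, 1][2]
+```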
+
+As the sketch shows, the easiest way to manually construct a `VarName` is to use the `@varname` macro on an indexing expression, which will take the `sym` value from the actual variable name and put the index values appropriately into the constructor.
+
+# `VarInfo`
+
+## Overview
+
+`VarInfo` is the data structure in `Turing` that facilitates tracking random variables and certain metadata about them that are required for sampling. For instance, the distribution of every random variable is stored in `VarInfo` because we need to know the support of every random variable when sampling with, for example, HMC. Random variables whose distributions have a constrained support are transformed using a bijector from [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl) so that the sampling happens in the unconstrained space. Different samplers require different metadata about the random variables.
+
+The definition of `VarInfo` in `Turing` is:
+
+```{julia}
+#| eval: false
+struct VarInfo{Tmeta, Tlogp} <: AbstractVarInfo
+    metadata::Tmeta
+    logp::Base.RefValue{Tlogp}
+    num_produce::Base.RefValue{Int}
+end
+```
+
+Depending on the type of `metadata`, the `VarInfo` is aliased either `UntypedVarInfo` or `TypedVarInfo`. `metadata` can be either a subtype of the union type `Metadata` or a `NamedTuple` of multiple such subtypes. Let `vi` be an instance of `VarInfo`. If `vi isa VarInfo{<:Metadata}`, then it is called an `UntypedVarInfo`. If `vi isa VarInfo{<:NamedTuple}`, then `vi.metadata` would be a `NamedTuple` mapping each symbol in `P` to an instance of `Metadata`, and `vi` would then be called a `TypedVarInfo`. The other fields of `VarInfo` include `logp`, which is used to accumulate the log probability or log probability density of the variables in `P` and `D`. `num_produce` keeps track of how many observations have been made in the model so far. It is incremented when running a `~` statement when the symbol on the LHS is in `D`.
+
+## `Metadata`
+
+The `Metadata` struct stores some metadata about the random variables sampled. This helps
+query certain information about a variable, such as its distribution, which samplers
+sample this variable, its value, and whether this value has been transformed to the real space or
+not. Let `md` be an instance of `Metadata`:
+
+  - `md.vns` is the vector of all `VarName` instances. Let `vn` be an arbitrary element of `md.vns`.
+  - `md.idcs` is the dictionary that maps each `VarName` instance to its index in
+    `md.vns`, `md.ranges`, `md.dists`, `md.orders` and `md.flags`.
+  - `md.vns[md.idcs[vn]] == vn`.
+  - `md.dists[md.idcs[vn]]` is the distribution of `vn`.
+  - `md.gids[md.idcs[vn]]` is the set of algorithms used to sample `vn`. This is used in
+    the Gibbs sampling process.
+  - `md.orders[md.idcs[vn]]` is the number of `observe` statements before `vn` is sampled.
+  - `md.ranges[md.idcs[vn]]` is the index range of `vn` in `md.vals`.
+  - `md.vals[md.ranges[md.idcs[vn]]]` is the linearized vector of values corresponding to `vn`.
+  - `md.flags` is a dictionary of true/false flags. `md.flags[flag][md.idcs[vn]]` is the
+    value of `flag` corresponding to `vn`.
+
+Note that in order to make `md::Metadata` type stable, all the `md.vns` must have the same symbol and distribution type. However, one can have a single Julia variable, e.g. `x`, that is a matrix or a hierarchical array sampled in partitions, e.g. `x[1][:] ~ MvNormal(zeros(2), I); x[2][:] ~ MvNormal(ones(2), I)`.
The symbol `x` can still be managed by a single `md::Metadata` without hurting the type stability since all the distributions on the RHS of `~` are of the same type.
+
+However, in `Turing` models this restriction cannot be imposed, so we must use a type unstable `Metadata` if we want to use one `Metadata` instance for the whole model. This is what `UntypedVarInfo` does. A type unstable `Metadata` will still work but will have inferior performance.
+
+To strike a balance between flexibility and performance when constructing the `spl::Sampler` instance, the model is first run by sampling the parameters in `P` from their priors using an `UntypedVarInfo`, i.e., a type unstable `Metadata` is used for all the variables. Then, once all the symbols and distribution types have been identified, a `vi::TypedVarInfo` is constructed where `vi.metadata` is a `NamedTuple` mapping each symbol in `P` to a specialized instance of `Metadata`. So as long as each symbol in `P` is sampled from only one type of distribution, `vi::TypedVarInfo` will have fully concretely typed fields, which brings out the peak performance of Julia.
diff --git a/developers/compiler/minituring-compiler/index.qmd b/developers/compiler/minituring-compiler/index.qmd
index c22894b17..605698694 100755
--- a/developers/compiler/minituring-compiler/index.qmd
+++ b/developers/compiler/minituring-compiler/index.qmd
@@ -1,295 +1,295 @@
----
-title: "A Mini Turing Implementation I: Compiler"
-engine: julia
----
-
-```{julia}
-#| echo: false
-#| output: false
-using Pkg;
-Pkg.instantiate();
-```
-
-In this tutorial we develop a very simple probabilistic programming language.
-The implementation is similar to [DynamicPPL](https://github.com/TuringLang/DynamicPPL.jl).
-This is intentional as we want to demonstrate some key ideas from Turing's internal implementation.
-
-To make things easy to understand and to implement we restrict our language to a very simple subset of the language that Turing actually supports.
-Defining an accurate syntax description is not our goal here, instead, we give a simple example and all similar programs should work.
-
-# Consider a probabilistic model defined by
-
-$$
-\begin{aligned}
-a &\sim \operatorname{Normal}(0.5, 1^2) \\
-b &\sim \operatorname{Normal}(a, 2^2) \\
-x &\sim \operatorname{Normal}(b, 0.5^2)
-\end{aligned}
-$$
-
-We assume that `x` is data, i.e., an observed variable.
-In our small language this model will be defined as
-
-```{julia}
-#| eval: false
-@mini_model function m(x)
-    a ~ Normal(0.5, 1)
-    b ~ Normal(a, 2)
-    x ~ Normal(b, 0.5)
-    return nothing
-end
-```
-
-Specifically, we demand that
-
-  - all observed variables are arguments of the program,
-  - the model definition does not contain any control flow,
-  - all variables are scalars, and
-  - the function returns `nothing`.
-
-First, we import some required packages:
-
-```{julia}
-using MacroTools, Distributions, Random, AbstractMCMC, MCMCChains
-```
-
-Before getting to the actual "compiler", we first build the data structure for the program trace.
-A program trace for a probabilistic programming language needs to at least record the values of stochastic variables and their log-probabilities.
- -```{julia} -struct VarInfo{V,L} - values::V - logps::L -end - -VarInfo() = VarInfo(Dict{Symbol,Float64}(), Dict{Symbol,Float64}()) - -function Base.setindex!(varinfo::VarInfo, (value, logp), var_id) - varinfo.values[var_id] = value - varinfo.logps[var_id] = logp - return varinfo -end -``` - -Internally, our probabilistic programming language works with two main functions: - - - `assume` for sampling unobserved variables and computing their log-probabilities, and - - `observe` for computing log-probabilities of observed variables (but not sampling them). - -For different inference algorithms we may have to use different sampling procedures and different log-probability computations. -For instance, in some cases we might want to sample all variables from their prior distributions and in other cases we might only want to compute the log-likelihood of the observations based on a given set of values for the unobserved variables. -Thus depending on the inference algorithm we want to use different `assume` and `observe` implementations. -We can achieve this by providing this `context` information as a function argument to `assume` and `observe`. - -**Note:** *Although the context system in this tutorial is inspired by DynamicPPL, it is very simplistic. -We expand this mini Turing example in the [contexts]({{}}) tutorial with some more complexity, to illustrate how and why contexts are central to Turing's design. For the full details one still needs to go to the actual source of DynamicPPL though.* - -Here we can see the implementation of a sampler that draws values of unobserved variables from the prior and computes the log-probability for every variable. - -```{julia} -struct SamplingContext{S<:AbstractMCMC.AbstractSampler,R<:Random.AbstractRNG} - rng::R - sampler::S -end - -struct PriorSampler <: AbstractMCMC.AbstractSampler end - -function observe(context::SamplingContext, varinfo, dist, var_id, var_value) - logp = logpdf(dist, var_value) - varinfo[var_id] = (var_value, logp) - return nothing -end - -function assume(context::SamplingContext{PriorSampler}, varinfo, dist, var_id) - sample = Random.rand(context.rng, dist) - logp = logpdf(dist, sample) - varinfo[var_id] = (sample, logp) - return sample -end; -``` - -Next we define the "compiler" for our simple programming language. -The term compiler is actually a bit misleading here since its only purpose is to transform the function definition in the `@mini_model` macro by - - - adding the context information (`context`) and the tracing data structure (`varinfo`) as additional arguments, and - - replacing tildes with calls to `assume` and `observe`. - -Afterwards, as usual the Julia compiler will just-in-time compile the model function when it is called. - -The manipulation of Julia expressions is an advanced part of the Julia language. -The [Julia documentation](https://docs.julialang.org/en/v1/manual/metaprogramming/) provides an introduction to and more details about this so-called metaprogramming. - -```{julia} -macro mini_model(expr) - return esc(mini_model(expr)) -end - -function mini_model(expr) - # Split the function definition into a dictionary with its name, arguments, body etc. 
- def = MacroTools.splitdef(expr) - - # Replace tildes in the function body with calls to `assume` or `observe` - def[:body] = MacroTools.postwalk(def[:body]) do sub_expr - if MacroTools.@capture(sub_expr, var_ ~ dist_) - if var in def[:args] - # If the variable is an argument of the model function, it is observed - return :($(observe)(context, varinfo, $dist, $(Meta.quot(var)), $var)) - else - # Otherwise it is unobserved - return :($var = $(assume)(context, varinfo, $dist, $(Meta.quot(var)))) - end - else - return sub_expr - end - end - - # Add `context` and `varinfo` arguments to the model function - def[:args] = vcat(:varinfo, :context, def[:args]) - - # Reassemble the function definition from its name, arguments, body etc. - return MacroTools.combinedef(def) -end; -``` - -For inference, we make use of the [AbstractMCMC interface](https://turinglang.github.io/AbstractMCMC.jl/dev/). -It provides a default implementation of a `sample` function for sampling a Markov chain. -The default implementation already supports e.g. sampling of multiple chains in parallel, thinning of samples, or discarding initial samples. - -The AbstractMCMC interface requires us to at least - - - define a model that is a subtype of `AbstractMCMC.AbstractModel`, - - define a sampler that is a subtype of `AbstractMCMC.AbstractSampler`, - - implement `AbstractMCMC.step` for our model and sampler. - -Thus here we define a `MiniModel` model. -In this model we store the model function and the observed data. - -```{julia} -struct MiniModel{F,D} <: AbstractMCMC.AbstractModel - f::F - data::D # a NamedTuple of all the data -end -``` - -In the Turing compiler, the model-specific `DynamicPPL.Model` is constructed automatically when calling the model function. -But for the sake of simplicity here we construct the model manually. - -To illustrate probabilistic inference with our mini language we implement an extremely simplistic Random-Walk Metropolis-Hastings sampler. -We hard-code the proposal step as part of the sampler and only allow normal distributions with zero mean and fixed standard deviation. -The Metropolis-Hastings sampler in Turing is more flexible. - -```{julia} -struct MHSampler{T<:Real} <: AbstractMCMC.AbstractSampler - sigma::T -end - -MHSampler() = MHSampler(1) - -function assume(context::SamplingContext{<:MHSampler}, varinfo, dist, var_id) - sampler = context.sampler - old_value = varinfo.values[var_id] - - # propose a random-walk step, i.e, add the current value to a random - # value sampled from a Normal distribution centered at 0 - value = rand(context.rng, Normal(old_value, sampler.sigma)) - logp = Distributions.logpdf(dist, value) - varinfo[var_id] = (value, logp) - - return value -end; -``` - -We need to define two `step` functions, one for the first step and the other for the following steps. -In the first step we sample values from the prior distributions and in the following steps we sample with the random-walk proposal. -The two functions are identified by the different arguments they take. - -```{julia} -# The fist step: Sampling from the prior distributions -function AbstractMCMC.step( - rng::Random.AbstractRNG, model::MiniModel, sampler::MHSampler; kwargs... -) - vi = VarInfo() - ctx = SamplingContext(rng, PriorSampler()) - model.f(vi, ctx, values(model.data)...) 
- return vi, vi -end - -# The following steps: Sampling with random-walk proposal -function AbstractMCMC.step( - rng::Random.AbstractRNG, - model::MiniModel, - sampler::MHSampler, - prev_state::VarInfo; # is just the old trace - kwargs..., -) - vi = prev_state - new_vi = deepcopy(vi) - ctx = SamplingContext(rng, sampler) - model.f(new_vi, ctx, values(model.data)...) - - # Compute log acceptance probability - # Since the proposal is symmetric the computation can be simplified - logα = sum(values(new_vi.logps)) - sum(values(vi.logps)) - - # Accept proposal with computed acceptance probability - if -randexp(rng) < logα - return new_vi, new_vi - else - return prev_state, prev_state - end -end; -``` - -To make it easier to analyze the samples and compare them with results from Turing, additionally we define a version of `AbstractMCMC.bundle_samples` for our model and sampler that returns a `MCMCChains.Chains` object of samples. - -```{julia} -function AbstractMCMC.bundle_samples( - samples, model::MiniModel, ::MHSampler, ::Any, ::Type{Chains}; kwargs... -) - # We get a vector of traces - values = [sample.values for sample in samples] - params = [key for key in keys(values[1]) if key ∉ keys(model.data)] - vals = reduce(hcat, [value[p] for value in values] for p in params) - # Composing the `Chains` data-structure, of which analyzing infrastructure is provided - chains = Chains(vals, params) - return chains -end; -``` - -Let us check how our mini probabilistic programming language works. -We define the probabilistic model: - -```{julia} -@mini_model function m(x) - a ~ Normal(0.5, 1) - b ~ Normal(a, 2) - x ~ Normal(b, 0.5) - return nothing -end; -``` - -We perform inference with data `x = 3.0`: - -```{julia} -sample(MiniModel(m, (x=3.0,)), MHSampler(), 1_000_000; chain_type=Chains, progress=false) -``` - -We compare these results with Turing. - -```{julia} -using Turing -using PDMats - -@model function turing_m(x) - a ~ Normal(0.5, 1) - b ~ Normal(a, 2) - x ~ Normal(b, 0.5) - return nothing -end - -sample(turing_m(3.0), MH(ScalMat(2, 1.0)), 1_000_000, progress=false) -``` - -As you can see, with our simple probabilistic programming language and custom samplers we get similar results as Turing. +--- +title: "A Mini Turing Implementation I: Compiler" +engine: julia +aliases: + - ../../../tutorials/14-minituring/index.html +--- + +```{julia} +#| echo: false +#| output: false +using Pkg; +Pkg.instantiate(); +``` + +In this tutorial we develop a very simple probabilistic programming language. +The implementation is similar to [DynamicPPL](https://github.com/TuringLang/DynamicPPL.jl). +This is intentional as we want to demonstrate some key ideas from Turing's internal implementation. + +To make things easy to understand and to implement we restrict our language to a very simple subset of the language that Turing actually supports. +Defining an accurate syntax description is not our goal here, instead, we give a simple example and all similar programs should work. + +# Consider a probabilistic model defined by + +$$ +\begin{aligned} +a &\sim \operatorname{Normal}(0.5, 1^2) \\ +b &\sim \operatorname{Normal}(a, 2^2) \\ +x &\sim \operatorname{Normal}(b, 0.5^2) +\end{aligned} +$$ + +We assume that `x` is data, i.e., an observed variable. 
+In our small language this model will be defined as
+
+```{julia}
+#| eval: false
+@mini_model function m(x)
+    a ~ Normal(0.5, 1)
+    b ~ Normal(a, 2)
+    x ~ Normal(b, 0.5)
+    return nothing
+end
+```
+
+Specifically, we demand that
+
+  - all observed variables are arguments of the program,
+  - the model definition does not contain any control flow,
+  - all variables are scalars, and
+  - the function returns `nothing`.
+
+First, we import some required packages:
+
+```{julia}
+using MacroTools, Distributions, Random, AbstractMCMC, MCMCChains
+```
+
+Before getting to the actual "compiler", we first build the data structure for the program trace.
+A program trace for a probabilistic programming language needs to at least record the values of stochastic variables and their log-probabilities.
+
+```{julia}
+struct VarInfo{V,L}
+    values::V
+    logps::L
+end
+
+VarInfo() = VarInfo(Dict{Symbol,Float64}(), Dict{Symbol,Float64}())
+
+function Base.setindex!(varinfo::VarInfo, (value, logp), var_id)
+    varinfo.values[var_id] = value
+    varinfo.logps[var_id] = logp
+    return varinfo
+end
+```
+
+Internally, our probabilistic programming language works with two main functions:
+
+  - `assume` for sampling unobserved variables and computing their log-probabilities, and
+  - `observe` for computing log-probabilities of observed variables (but not sampling them).
+
+For different inference algorithms we may have to use different sampling procedures and different log-probability computations.
+For instance, in some cases we might want to sample all variables from their prior distributions and in other cases we might only want to compute the log-likelihood of the observations based on a given set of values for the unobserved variables.
+Thus, depending on the inference algorithm, we want to use different `assume` and `observe` implementations.
+We can achieve this by providing the `context` information as a function argument to `assume` and `observe`.
+
+**Note:** *Although the context system in this tutorial is inspired by DynamicPPL, it is very simplistic.
+We expand this mini Turing example in the [contexts]({{< meta contexts >}}) tutorial with some more complexity, to illustrate how and why contexts are central to Turing's design. For the full details one still needs to go to the actual source of DynamicPPL though.*
+
+Here we can see the implementation of a sampler that draws values of unobserved variables from the prior and computes the log-probability for every variable.
+
+```{julia}
+struct SamplingContext{S<:AbstractMCMC.AbstractSampler,R<:Random.AbstractRNG}
+    rng::R
+    sampler::S
+end
+
+struct PriorSampler <: AbstractMCMC.AbstractSampler end
+
+function observe(context::SamplingContext, varinfo, dist, var_id, var_value)
+    logp = logpdf(dist, var_value)
+    varinfo[var_id] = (var_value, logp)
+    return nothing
+end
+
+function assume(context::SamplingContext{PriorSampler}, varinfo, dist, var_id)
+    sample = Random.rand(context.rng, dist)
+    logp = logpdf(dist, sample)
+    varinfo[var_id] = (sample, logp)
+    return sample
+end;
+```
+
+Next we define the "compiler" for our simple programming language.
+The term compiler is actually a bit misleading here since its only purpose is to transform the function definition in the `@mini_model` macro by
+
+  - adding the context information (`context`) and the tracing data structure (`varinfo`) as additional arguments, and
+  - replacing tildes with calls to `assume` and `observe`.
+
+Afterwards, as usual, the Julia compiler will just-in-time compile the model function when it is called.
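+
+For example, after expansion the body of the model `m` shown above behaves roughly like the following hand-written function (a sketch; the real expansion interpolates the `assume` and `observe` function objects directly rather than referring to them by name):
+
+```{julia}
+#| eval: false
+function m(varinfo, context, x)
+    # `a` and `b` are not arguments of `m`, so they are unobserved:
+    a = assume(context, varinfo, Normal(0.5, 1), :a)
+    b = assume(context, varinfo, Normal(a, 2), :b)
+    # `x` is an argument of `m`, so it is observed:
+    observe(context, varinfo, Normal(b, 0.5), :x, x)
+    return nothing
+end
+```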
+
+The manipulation of Julia expressions is an advanced part of the Julia language.
+The [Julia documentation](https://docs.julialang.org/en/v1/manual/metaprogramming/) provides an introduction to, and more details about, this so-called metaprogramming.
+
+```{julia}
+macro mini_model(expr)
+    return esc(mini_model(expr))
+end
+
+function mini_model(expr)
+    # Split the function definition into a dictionary with its name, arguments, body etc.
+    def = MacroTools.splitdef(expr)
+
+    # Replace tildes in the function body with calls to `assume` or `observe`
+    def[:body] = MacroTools.postwalk(def[:body]) do sub_expr
+        if MacroTools.@capture(sub_expr, var_ ~ dist_)
+            if var in def[:args]
+                # If the variable is an argument of the model function, it is observed
+                return :($(observe)(context, varinfo, $dist, $(Meta.quot(var)), $var))
+            else
+                # Otherwise it is unobserved
+                return :($var = $(assume)(context, varinfo, $dist, $(Meta.quot(var))))
+            end
+        else
+            return sub_expr
+        end
+    end
+
+    # Add `context` and `varinfo` arguments to the model function
+    def[:args] = vcat(:varinfo, :context, def[:args])
+
+    # Reassemble the function definition from its name, arguments, body etc.
+    return MacroTools.combinedef(def)
+end;
+```
+
+For inference, we make use of the [AbstractMCMC interface](https://turinglang.github.io/AbstractMCMC.jl/dev/).
+It provides a default implementation of a `sample` function for sampling a Markov chain.
+The default implementation already supports, e.g., sampling of multiple chains in parallel, thinning of samples, or discarding initial samples.
+
+The AbstractMCMC interface requires us to at least
+
+  - define a model that is a subtype of `AbstractMCMC.AbstractModel`,
+  - define a sampler that is a subtype of `AbstractMCMC.AbstractSampler`,
+  - implement `AbstractMCMC.step` for our model and sampler.
+
+Thus, here we define a `MiniModel` model.
+In this model we store the model function and the observed data.
+
+```{julia}
+struct MiniModel{F,D} <: AbstractMCMC.AbstractModel
+    f::F
+    data::D # a NamedTuple of all the data
+end
+```
+
+In the Turing compiler, the model-specific `DynamicPPL.Model` is constructed automatically when calling the model function.
+But for the sake of simplicity, here we construct the model manually.
+
+To illustrate probabilistic inference with our mini language, we implement an extremely simplistic Random-Walk Metropolis-Hastings sampler.
+We hard-code the proposal step as part of the sampler and only allow normal distributions with zero mean and fixed standard deviation.
+The Metropolis-Hastings sampler in Turing is more flexible.
+
+```{julia}
+struct MHSampler{T<:Real} <: AbstractMCMC.AbstractSampler
+    sigma::T
+end
+
+MHSampler() = MHSampler(1)
+
+function assume(context::SamplingContext{<:MHSampler}, varinfo, dist, var_id)
+    sampler = context.sampler
+    old_value = varinfo.values[var_id]
+
+    # propose a random-walk step, i.e., sample a new value from a Normal
+    # distribution centered at the current value
+    value = rand(context.rng, Normal(old_value, sampler.sigma))
+    logp = Distributions.logpdf(dist, value)
+    varinfo[var_id] = (value, logp)
+
+    return value
+end;
+```
+
+We need to define two `step` functions, one for the first step and the other for the following steps.
+In the first step we sample values from the prior distributions and in the following steps we sample with the random-walk proposal.
+The two functions are identified by the different arguments they take.
+
+```{julia}
+# The first step: Sampling from the prior distributions
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG, model::MiniModel, sampler::MHSampler; kwargs...
+)
+    vi = VarInfo()
+    ctx = SamplingContext(rng, PriorSampler())
+    model.f(vi, ctx, values(model.data)...)
+    return vi, vi
+end
+
+# The following steps: Sampling with random-walk proposal
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG,
+    model::MiniModel,
+    sampler::MHSampler,
+    prev_state::VarInfo; # is just the old trace
+    kwargs...,
+)
+    vi = prev_state
+    new_vi = deepcopy(vi)
+    ctx = SamplingContext(rng, sampler)
+    model.f(new_vi, ctx, values(model.data)...)
+
+    # Compute log acceptance probability
+    # Since the proposal is symmetric, the computation can be simplified
+    logα = sum(values(new_vi.logps)) - sum(values(vi.logps))
+
+    # Accept proposal with computed acceptance probability
+    if -randexp(rng) < logα
+        return new_vi, new_vi
+    else
+        return prev_state, prev_state
+    end
+end;
+```
+
+To make it easier to analyze the samples and compare them with results from Turing, we additionally define a version of `AbstractMCMC.bundle_samples` for our model and sampler that returns a `MCMCChains.Chains` object of samples.
+
+```{julia}
+function AbstractMCMC.bundle_samples(
+    samples, model::MiniModel, ::MHSampler, ::Any, ::Type{Chains}; kwargs...
+)
+    # We get a vector of traces
+    values = [sample.values for sample in samples]
+    params = [key for key in keys(values[1]) if key ∉ keys(model.data)]
+    vals = reduce(hcat, [value[p] for value in values] for p in params)
+    # Compose the `Chains` data structure, for which analysis infrastructure is provided
+    chains = Chains(vals, params)
+    return chains
+end;
+```
+
+Let us check how our mini probabilistic programming language works.
+We define the probabilistic model:
+
+```{julia}
+@mini_model function m(x)
+    a ~ Normal(0.5, 1)
+    b ~ Normal(a, 2)
+    x ~ Normal(b, 0.5)
+    return nothing
+end;
+```
+
+We perform inference with data `x = 3.0`:
+
+```{julia}
+sample(MiniModel(m, (x=3.0,)), MHSampler(), 1_000_000; chain_type=Chains, progress=false)
+```
+
+We compare these results with Turing.
+
+```{julia}
+using Turing
+using PDMats
+
+@model function turing_m(x)
+    a ~ Normal(0.5, 1)
+    b ~ Normal(a, 2)
+    x ~ Normal(b, 0.5)
+    return nothing
+end
+
+sample(turing_m(3.0), MH(ScalMat(2, 1.0)), 1_000_000, progress=false)
+```
+
+As you can see, with our simple probabilistic programming language and custom samplers we get similar results to Turing.
diff --git a/developers/compiler/minituring-contexts/index.qmd b/developers/compiler/minituring-contexts/index.qmd
index 6468483ef..cbad94380 100755
--- a/developers/compiler/minituring-contexts/index.qmd
+++ b/developers/compiler/minituring-contexts/index.qmd
@@ -1,306 +1,306 @@
----
-title: "A Mini Turing Implementation II: Contexts"
-engine: julia
----
-
-```{julia}
-#| echo: false
-#| output: false
-using Pkg;
-Pkg.instantiate();
-```
-
-In the [Mini Turing]({{< meta minituring >}}) tutorial we developed a miniature version of the Turing language, to illustrate its core design. A passing mention was made of contexts. In this tutorial we develop that aspect of our mini Turing language further to demonstrate how and why contexts are an important part of Turing's design.
-
-# Mini Turing expanded, now with more contexts
-
-If you haven't read [Mini Turing]({{< meta minituring >}}) yet, you should do that first. We start by repeating verbatim much of the code from there.
Define the type for holding values for variables: - -```{julia} -import MacroTools, Random, AbstractMCMC -using Distributions: Normal, logpdf -using MCMCChains: Chains -using AbstractMCMC: sample - -struct VarInfo{V,L} - values::V - logps::L -end - -VarInfo() = VarInfo(Dict{Symbol,Float64}(), Dict{Symbol,Float64}()) - -function Base.setindex!(varinfo::VarInfo, (value, logp), var_id) - varinfo.values[var_id] = value - varinfo.logps[var_id] = logp - return varinfo -end -``` - -Define the macro that expands `~` expressions to calls to `assume` and `observe`: - -```{julia} -# Methods will be defined for these later. -function assume end -function observe end - -macro mini_model(expr) - return esc(mini_model(expr)) -end - -function mini_model(expr) - # Split the function definition into a dictionary with its name, arguments, body etc. - def = MacroTools.splitdef(expr) - - # Replace tildes in the function body with calls to `assume` or `observe` - def[:body] = MacroTools.postwalk(def[:body]) do sub_expr - if MacroTools.@capture(sub_expr, var_ ~ dist_) - if var in def[:args] - # If the variable is an argument of the model function, it is observed - return :($(observe)(context, varinfo, $dist, $(Meta.quot(var)), $var)) - else - # Otherwise it is unobserved - return :($var = $(assume)(context, varinfo, $dist, $(Meta.quot(var)))) - end - else - return sub_expr - end - end - - # Add `context` and `varinfo` arguments to the model function - def[:args] = vcat(:varinfo, :context, def[:args]) - - # Reassemble the function definition from its name, arguments, body etc. - return MacroTools.combinedef(def) -end - - -struct MiniModel{F,D} <: AbstractMCMC.AbstractModel - f::F - data::D # a NamedTuple of all the data -end -``` - -Define an example model: - -```{julia} -@mini_model function m(x) - a ~ Normal(0.5, 1) - b ~ Normal(a, 2) - x ~ Normal(b, 0.5) - return nothing -end; - -mini_m = MiniModel(m, (x=3.0,)) -``` - -Previously in the mini Turing case, at this point we defined `SamplingContext`, a structure that holds a random number generator and a sampler, and gets passed to `observe` and `assume`. We then used it to implement a simple Metropolis-Hastings sampler. - -The notion of a context may have seemed overly complicated just to implement the sampler, but there are other things we may want to do with a model than sample from the posterior. Having the context passing in place lets us do that without having to touch the above macro at all. For instance, let's say we want to evaluate the log joint probability of the model for a given set of data and parameters. Using a new context type we can use the previously defined `model` function, but change its behavior by changing what the `observe` and `assume` functions do. - - - -```{julia} -struct JointContext end - -function observe(context::JointContext, varinfo, dist, var_id, var_value) - logp = logpdf(dist, var_value) - varinfo[var_id] = (var_value, logp) - return nothing -end - -function assume(context::JointContext, varinfo, dist, var_id) - if !haskey(varinfo.values, var_id) - error("Can't evaluate the log probability if the variable $(var_id) is not set.") - end - var_value = varinfo.values[var_id] - logp = logpdf(dist, var_value) - varinfo[var_id] = (var_value, logp) - return var_value -end - -function logjoint(model, parameter_values::NamedTuple) - vi = VarInfo() - for (var_id, value) in pairs(parameter_values) - # Set the log prob to NaN for now. These will get overwritten when model.f is - # called with JointContext. 
- vi[var_id] = (value, NaN) - end - model.f(vi, JointContext(), values(model.data)...) - return sum(values(vi.logps)) -end - -logjoint(mini_m, (a=0.5, b=1.0)) -``` - -When using the `JointContext` no sampling whatsoever happens in calling `mini_m`. Rather only the log probability of each given variable value is evaluated. `logjoint` then sums these results to get the total log joint probability. - -We can similarly define a context for evaluating the log prior probability: - -```{julia} -struct PriorContext end - -function observe(context::PriorContext, varinfo, dist, var_id, var_value) - # Since we are evaluating the prior, the log probability of all the observations - # is set to 0. This has the effect of ignoring the likelihood. - varinfo[var_id] = (var_value, 0.0) - return nothing -end - -function assume(context::PriorContext, varinfo, dist, var_id) - if !haskey(varinfo.values, var_id) - error("Can't evaluate the log probability if the variable $(var_id) is not set.") - end - var_value = varinfo.values[var_id] - logp = logpdf(dist, var_value) - varinfo[var_id] = (var_value, logp) - return var_value -end - -function logprior(model, parameter_values::NamedTuple) - vi = VarInfo() - for (var_id, value) in pairs(parameter_values) - vi[var_id] = (value, NaN) - end - model.f(vi, PriorContext(), values(model.data)...) - return sum(values(vi.logps)) -end - -logprior(mini_m, (a=0.5, b=1.0)) -``` - -Notice that the definition of `assume(context::PriorContext, args...)` is identical to the one for `JointContext`, and `logprior` and `logjoint` are also identical except for the context type they create. There's clearly an opportunity here for some refactoring using abstract types, but that's outside the scope of this tutorial. Rather, the point here is to demonstrate that we can extract different sorts of things from our model by defining different context types, and specialising `observe` and `assume` for them. - - -## Contexts within contexts - -Let's use the above two contexts to provide a slightly more general definition of the `SamplingContext` and the Metropolis-Hastings sampler we wrote in the mini Turing tutorial. - -```{julia} -struct SamplingContext{S<:AbstractMCMC.AbstractSampler,R<:Random.AbstractRNG} - rng::R - sampler::S - subcontext::Union{PriorContext, JointContext} -end -``` - -The new aspect here is the `subcontext` field. Note that this is a context within a context! The idea is that we don't need to hard code how the MCMC sampler evaluates the log probability, but rather can pass that work onto the subcontext. This way the same sampler can be used to sample from either the joint or the prior distribution. - -The methods for `SamplingContext` are largely as in the our earlier mini Turing case, except they now pass some of the work onto the subcontext: - -```{julia} -function observe(context::SamplingContext, args...) - # Sampling doesn't affect the observed values, so nothing to do here other than pass to - # the subcontext. - return observe(context.subcontext, args...) -end - -struct PriorSampler <: AbstractMCMC.AbstractSampler end - -function assume(context::SamplingContext{PriorSampler}, varinfo, dist, var_id) - sample = Random.rand(context.rng, dist) - varinfo[var_id] = (sample, NaN) - # Once the value has been sampled, let the subcontext handle evaluating the log - # probability. - return assume(context.subcontext, varinfo, dist, var_id) -end; - -# The subcontext field of the MHSampler determines which distribution this sampler -# samples from. 
-struct MHSampler{D, T<:Real} <: AbstractMCMC.AbstractSampler - sigma::T - subcontext::D -end - -MHSampler(subcontext) = MHSampler(1, subcontext) - -function assume(context::SamplingContext{<:MHSampler}, varinfo, dist, var_id) - sampler = context.sampler - old_value = varinfo.values[var_id] - - # propose a random-walk step, i.e, add the current value to a random - # value sampled from a Normal distribution centered at 0 - value = rand(context.rng, Normal(old_value, sampler.sigma)) - varinfo[var_id] = (value, NaN) - # Once the value has been sampled, let the subcontext handle evaluating the log - # probability. - return assume(context.subcontext, varinfo, dist, var_id) -end; - -# The following three methods are identical to before, except for passing -# `sampler.subcontext` to the context SamplingContext. -function AbstractMCMC.step( - rng::Random.AbstractRNG, model::MiniModel, sampler::MHSampler; kwargs... -) - vi = VarInfo() - ctx = SamplingContext(rng, PriorSampler(), sampler.subcontext) - model.f(vi, ctx, values(model.data)...) - return vi, vi -end - -function AbstractMCMC.step( - rng::Random.AbstractRNG, - model::MiniModel, - sampler::MHSampler, - prev_state::VarInfo; # is just the old trace - kwargs..., -) - vi = prev_state - new_vi = deepcopy(vi) - ctx = SamplingContext(rng, sampler, sampler.subcontext) - model.f(new_vi, ctx, values(model.data)...) - - # Compute log acceptance probability - # Since the proposal is symmetric the computation can be simplified - logα = sum(values(new_vi.logps)) - sum(values(vi.logps)) - - # Accept proposal with computed acceptance probability - if -Random.randexp(rng) < logα - return new_vi, new_vi - else - return prev_state, prev_state - end -end; - -function AbstractMCMC.bundle_samples( - samples, model::MiniModel, ::MHSampler, ::Any, ::Type{Chains}; kwargs... -) - # We get a vector of traces - values = [sample.values for sample in samples] - params = [key for key in keys(values[1]) if key ∉ keys(model.data)] - vals = reduce(hcat, [value[p] for value in values] for p in params) - # Composing the `Chains` data-structure, of which analyzing infrastructure is provided - chains = Chains(vals, params) - return chains -end; -``` - -We can use this to sample from the joint distribution just like before: - -```{julia} -sample(MiniModel(m, (x=3.0,)), MHSampler(JointContext()), 1_000_000; chain_type=Chains, progress=false) -``` - -or we can choose to sample from the prior instead - -```{julia} -sample(MiniModel(m, (x=3.0,)), MHSampler(PriorContext()), 1_000_000; chain_type=Chains, progress=false) -``` - -Of course, using an MCMC algorithm to sample from the prior is unnecessary and silly (`PriorSampler` exists, after all), but the point is to illustrate the flexibility of the context system. We could, for instance, use the same setup to implement an _Approximate Bayesian Computation_ (ABC) algorithm. - - -The use of contexts also goes far beyond just evaluating log probabilities and sampling. Some examples from Turing are - -* `FixedContext`, which fixes some variables to given values and removes them completely from the evaluation of any log probabilities. They power the `Turing.fix` and `Turing.unfix` functions. -* `ConditionContext` conditions the model on fixed values for some parameters. They are used by `Turing.condition` and `Turing.uncondition`, i.e. the `model | (parameter=value,)` syntax. The difference between `fix` and `condition` is whether the log probability for the corresponding variable is included in the overall log density. 
- -* `PriorExtractorContext` collects information about what the prior distribution of each variable is. -* `PrefixContext` adds prefixes to variable names, allowing models to be used within other models without variable name collisions. -* `PointwiseLikelihoodContext` records the log likelihood of each individual variable. -* `DebugContext` collects useful debugging information while executing the model. - -All of the above are what Turing calls _parent contexts_, which is to say that they all keep a subcontext just like our above `SamplingContext` did. Their implementations of `assume` and `observe` call the implementation of the subcontext once they are done doing their own work of fixing/conditioning/prefixing/etc. Contexts are often chained, so that e.g. a `DebugContext` may wrap within it a `PrefixContext`, which may in turn wrap a `ConditionContext`, etc. The only contexts that _don't_ have a subcontext in the Turing are the ones for evaluating the prior, likelihood, and joint distributions. These are called _leaf contexts_. - -The above version of mini Turing is still much simpler than the full Turing language, but the principles of how contexts are used are the same. +--- +title: "A Mini Turing Implementation II: Contexts" +engine: julia +aliases: + - ../../../tutorials/16-contexts/index.html +--- + +```{julia} +#| echo: false +#| output: false +using Pkg; +Pkg.instantiate(); +``` + +In the [Mini Turing]({{< meta minituring >}}) tutorial we developed a miniature version of the Turing language, to illustrate its core design. A passing mention was made of contexts. In this tutorial we develop that aspect of our mini Turing language further to demonstrate how and why contexts are an important part of Turing's design. + +# Mini Turing expanded, now with more contexts + +If you haven't read [Mini Turing]({{< meta minituring >}}) yet, you should do that first. We start by repeating verbatim much of the code from there. Define the type for holding values for variables: + +```{julia} +import MacroTools, Random, AbstractMCMC +using Distributions: Normal, logpdf +using MCMCChains: Chains +using AbstractMCMC: sample + +struct VarInfo{V,L} + values::V + logps::L +end + +VarInfo() = VarInfo(Dict{Symbol,Float64}(), Dict{Symbol,Float64}()) + +function Base.setindex!(varinfo::VarInfo, (value, logp), var_id) + varinfo.values[var_id] = value + varinfo.logps[var_id] = logp + return varinfo +end +``` + +Define the macro that expands `~` expressions to calls to `assume` and `observe`: + +```{julia} +# Methods will be defined for these later. +function assume end +function observe end + +macro mini_model(expr) + return esc(mini_model(expr)) +end + +function mini_model(expr) + # Split the function definition into a dictionary with its name, arguments, body etc. + def = MacroTools.splitdef(expr) + + # Replace tildes in the function body with calls to `assume` or `observe` + def[:body] = MacroTools.postwalk(def[:body]) do sub_expr + if MacroTools.@capture(sub_expr, var_ ~ dist_) + if var in def[:args] + # If the variable is an argument of the model function, it is observed + return :($(observe)(context, varinfo, $dist, $(Meta.quot(var)), $var)) + else + # Otherwise it is unobserved + return :($var = $(assume)(context, varinfo, $dist, $(Meta.quot(var)))) + end + else + return sub_expr + end + end + + # Add `context` and `varinfo` arguments to the model function + def[:args] = vcat(:varinfo, :context, def[:args]) + + # Reassemble the function definition from its name, arguments, body etc. 
+    return MacroTools.combinedef(def)
+end
+
+
+struct MiniModel{F,D} <: AbstractMCMC.AbstractModel
+    f::F
+    data::D # a NamedTuple of all the data
+end
+```
+
+Define an example model:
+
+```{julia}
+@mini_model function m(x)
+    a ~ Normal(0.5, 1)
+    b ~ Normal(a, 2)
+    x ~ Normal(b, 0.5)
+    return nothing
+end;
+
+mini_m = MiniModel(m, (x=3.0,))
+```
+
+Previously, in the mini Turing case, at this point we defined `SamplingContext`, a structure that holds a random number generator and a sampler, and is passed to `observe` and `assume`. We then used it to implement a simple Metropolis-Hastings sampler.
+
+The notion of a context may have seemed overly complicated just to implement the sampler, but there are other things we may want to do with a model besides sampling from the posterior. Having the context-passing machinery in place lets us do that without having to touch the above macro at all. For instance, let's say we want to evaluate the log joint probability of the model for a given set of data and parameters. Using a new context type, we can reuse the previously defined model function, but change its behavior by changing what the `observe` and `assume` functions do.
+
+```{julia}
+struct JointContext end
+
+function observe(context::JointContext, varinfo, dist, var_id, var_value)
+    logp = logpdf(dist, var_value)
+    varinfo[var_id] = (var_value, logp)
+    return nothing
+end
+
+function assume(context::JointContext, varinfo, dist, var_id)
+    if !haskey(varinfo.values, var_id)
+        error("Can't evaluate the log probability if the variable $(var_id) is not set.")
+    end
+    var_value = varinfo.values[var_id]
+    logp = logpdf(dist, var_value)
+    varinfo[var_id] = (var_value, logp)
+    return var_value
+end
+
+function logjoint(model, parameter_values::NamedTuple)
+    vi = VarInfo()
+    for (var_id, value) in pairs(parameter_values)
+        # Set the log prob to NaN for now. These will get overwritten when model.f is
+        # called with JointContext.
+        vi[var_id] = (value, NaN)
+    end
+    model.f(vi, JointContext(), values(model.data)...)
+    return sum(values(vi.logps))
+end
+
+logjoint(mini_m, (a=0.5, b=1.0))
+```
+
+When using the `JointContext`, no sampling whatsoever happens when calling `mini_m`. Rather, only the log probability of each given variable value is evaluated. `logjoint` then sums these results to get the total log joint probability.
+
+We can similarly define a context for evaluating the log prior probability:
+
+```{julia}
+struct PriorContext end
+
+function observe(context::PriorContext, varinfo, dist, var_id, var_value)
+    # Since we are evaluating the prior, the log probability of all the observations
+    # is set to 0. This has the effect of ignoring the likelihood.
+    varinfo[var_id] = (var_value, 0.0)
+    return nothing
+end
+
+function assume(context::PriorContext, varinfo, dist, var_id)
+    if !haskey(varinfo.values, var_id)
+        error("Can't evaluate the log probability if the variable $(var_id) is not set.")
+    end
+    var_value = varinfo.values[var_id]
+    logp = logpdf(dist, var_value)
+    varinfo[var_id] = (var_value, logp)
+    return var_value
+end
+
+function logprior(model, parameter_values::NamedTuple)
+    vi = VarInfo()
+    for (var_id, value) in pairs(parameter_values)
+        vi[var_id] = (value, NaN)
+    end
+    model.f(vi, PriorContext(), values(model.data)...)
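+    # `observe` assigned log probability 0.0 to the observations above, so this
+    # sum contains only the log prior.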
+    return sum(values(vi.logps))
+end
+
+logprior(mini_m, (a=0.5, b=1.0))
+```
+
+Notice that the definition of `assume(context::PriorContext, args...)` is identical to the one for `JointContext`, and `logprior` and `logjoint` are also identical except for the context type they create. There's clearly an opportunity here for some refactoring using abstract types, but that's outside the scope of this tutorial. Rather, the point here is to demonstrate that we can extract different sorts of things from our model by defining different context types, and specialising `observe` and `assume` for them.
+
+
+## Contexts within contexts
+
+Let's use the above two contexts to provide a slightly more general definition of the `SamplingContext` and the Metropolis-Hastings sampler we wrote in the mini Turing tutorial.
+
+```{julia}
+struct SamplingContext{S<:AbstractMCMC.AbstractSampler,R<:Random.AbstractRNG}
+    rng::R
+    sampler::S
+    subcontext::Union{PriorContext, JointContext}
+end
+```
+
+The new aspect here is the `subcontext` field. Note that this is a context within a context! The idea is that we don't need to hard-code how the MCMC sampler evaluates the log probability, but rather can pass that work on to the subcontext. This way the same sampler can be used to sample from either the joint or the prior distribution.
+
+The methods for `SamplingContext` are largely as in our earlier mini Turing case, except that they now pass some of the work on to the subcontext:
+
+```{julia}
+function observe(context::SamplingContext, args...)
+    # Sampling doesn't affect the observed values, so nothing to do here other than pass to
+    # the subcontext.
+    return observe(context.subcontext, args...)
+end
+
+struct PriorSampler <: AbstractMCMC.AbstractSampler end
+
+function assume(context::SamplingContext{PriorSampler}, varinfo, dist, var_id)
+    sample = Random.rand(context.rng, dist)
+    varinfo[var_id] = (sample, NaN)
+    # Once the value has been sampled, let the subcontext handle evaluating the log
+    # probability.
+    return assume(context.subcontext, varinfo, dist, var_id)
+end;
+
+# The subcontext field of the MHSampler determines which distribution this sampler
+# samples from.
+struct MHSampler{D, T<:Real} <: AbstractMCMC.AbstractSampler
+    sigma::T
+    subcontext::D
+end
+
+MHSampler(subcontext) = MHSampler(1, subcontext)
+
+function assume(context::SamplingContext{<:MHSampler}, varinfo, dist, var_id)
+    sampler = context.sampler
+    old_value = varinfo.values[var_id]
+
+    # propose a random-walk step, i.e., sample a new value from a Normal
+    # distribution centered at the current value
+    value = rand(context.rng, Normal(old_value, sampler.sigma))
+    varinfo[var_id] = (value, NaN)
+    # Once the value has been sampled, let the subcontext handle evaluating the log
+    # probability.
+    return assume(context.subcontext, varinfo, dist, var_id)
+end;
+
+# The following three methods are identical to before, except for passing
+# `sampler.subcontext` to the `SamplingContext` constructor.
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG, model::MiniModel, sampler::MHSampler; kwargs...
+)
+    vi = VarInfo()
+    ctx = SamplingContext(rng, PriorSampler(), sampler.subcontext)
+    model.f(vi, ctx, values(model.data)...)
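+    # The prior draw serves both as the first sample and as the initial state of the sampler.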
+    return vi, vi
+end
+
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG,
+    model::MiniModel,
+    sampler::MHSampler,
+    prev_state::VarInfo; # is just the old trace
+    kwargs...,
+)
+    vi = prev_state
+    new_vi = deepcopy(vi)
+    ctx = SamplingContext(rng, sampler, sampler.subcontext)
+    model.f(new_vi, ctx, values(model.data)...)
+
+    # Compute log acceptance probability
+    # Since the proposal is symmetric, the computation can be simplified
+    logα = sum(values(new_vi.logps)) - sum(values(vi.logps))
+
+    # Accept proposal with computed acceptance probability
+    if -Random.randexp(rng) < logα
+        return new_vi, new_vi
+    else
+        return prev_state, prev_state
+    end
+end;
+
+function AbstractMCMC.bundle_samples(
+    samples, model::MiniModel, ::MHSampler, ::Any, ::Type{Chains}; kwargs...
+)
+    # We get a vector of traces
+    values = [sample.values for sample in samples]
+    params = [key for key in keys(values[1]) if key ∉ keys(model.data)]
+    vals = reduce(hcat, [value[p] for value in values] for p in params)
+    # Compose the `Chains` data structure, for which analysis infrastructure is provided
+    chains = Chains(vals, params)
+    return chains
+end;
+```
+
+We can use this to sample from the joint distribution just like before:
+
+```{julia}
+sample(MiniModel(m, (x=3.0,)), MHSampler(JointContext()), 1_000_000; chain_type=Chains, progress=false)
+```
+
+or we can choose to sample from the prior instead:
+
+```{julia}
+sample(MiniModel(m, (x=3.0,)), MHSampler(PriorContext()), 1_000_000; chain_type=Chains, progress=false)
+```
+
+Of course, using an MCMC algorithm to sample from the prior is unnecessary and silly (`PriorSampler` exists, after all), but the point is to illustrate the flexibility of the context system. We could, for instance, use the same setup to implement an _Approximate Bayesian Computation_ (ABC) algorithm.
+
+The use of contexts also goes far beyond just evaluating log probabilities and sampling. Some examples from Turing are
+
+* `FixedContext`, which fixes some variables to given values and removes them completely from the evaluation of any log probabilities. It powers the `Turing.fix` and `Turing.unfix` functions.
+* `ConditionContext`, which conditions the model on fixed values for some parameters. It is used by `Turing.condition` and `Turing.uncondition`, i.e. the `model | (parameter=value,)` syntax. The difference between `fix` and `condition` is whether the log probability for the corresponding variable is included in the overall log density.
+* `PriorExtractorContext`, which collects information about what the prior distribution of each variable is.
+* `PrefixContext`, which adds prefixes to variable names, allowing models to be used within other models without variable name collisions.
+* `PointwiseLikelihoodContext`, which records the log likelihood of each individual variable.
+* `DebugContext`, which collects useful debugging information while executing the model.
+
+All of the above are what Turing calls _parent contexts_, which is to say that they all keep a subcontext just like our above `SamplingContext` did. Their implementations of `assume` and `observe` call the implementation of the subcontext once they are done doing their own work of fixing/conditioning/prefixing/etc. Contexts are often chained, so that e.g. a `DebugContext` may wrap within it a `PrefixContext`, which may in turn wrap a `ConditionContext`, etc. The only contexts that _don't_ have a subcontext in Turing are the ones for evaluating the prior, likelihood, and joint distributions. These are called _leaf contexts_.
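+
+To make the parent-context pattern concrete, here is a hypothetical `LoggingContext` for our mini language (the name and behaviour are invented for illustration, not taken from Turing): it records every variable it encounters and then delegates all actual work to its subcontext, just like the parent contexts listed above.
+
+```{julia}
+struct LoggingContext{C}
+    subcontext::C
+end
+
+function observe(context::LoggingContext, varinfo, dist, var_id, var_value)
+    println("observe: ", var_id)
+    return observe(context.subcontext, varinfo, dist, var_id, var_value)
+end
+
+function assume(context::LoggingContext, varinfo, dist, var_id)
+    println("assume: ", var_id)
+    return assume(context.subcontext, varinfo, dist, var_id)
+end
+
+# Evaluate the log joint of `mini_m` while logging every variable access.
+vi = VarInfo()
+vi[:a] = (0.5, NaN)
+vi[:b] = (1.0, NaN)
+mini_m.f(vi, LoggingContext(JointContext()), values(mini_m.data)...)
+sum(values(vi.logps))
+```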
+ +The above version of mini Turing is still much simpler than the full Turing language, but the principles of how contexts are used are the same. diff --git a/developers/contributing/index.qmd b/developers/contributing/index.qmd index 00040e7e2..e2e5b12aa 100755 --- a/developers/contributing/index.qmd +++ b/developers/contributing/index.qmd @@ -1,78 +1,78 @@ ---- -title: Contributing -aliases: - - ../../tutorials/docs-01-contributing-guide/index.html ---- - -Turing is an open-source project and is [hosted on GitHub](https://github.com/TuringLang). -We welcome contributions from the community in all forms large or small: bug reports, feature implementations, code contributions, or improvements to documentation or infrastructure are all extremely valuable. -We would also very much appreciate examples of models written using Turing. - -### How to get involved - -Our outstanding issues are tabulated on our [issue tracker](https://github.com/TuringLang/Turing.jl/issues). -Closing one of these may involve implementing new features, fixing bugs, or writing example models. - -You can also join the `#turing` channel on the [Julia Slack](https://julialang.org/slack/) and say hello! - -If you are new to open-source software, please see [GitHub's introduction](https://guides.github.com/introduction/flow/) or [Julia's contribution guide](https://github.com/JuliaLang/julia/blob/master/CONTRIBUTING.md) on using version control for collaboration. - -### Documentation - -Each of the packages in the Turing ecosystem (see [Libraries](/library)) has its own documentation, which is typically found in the `docs` folder of the corresponding package. -For example, the source code for DynamicPPL's documentation can be found in [its repository](https://github.com/TuringLang/DynamicPPL.jl). - -The documentation for Turing.jl itself consists of the tutorials that you see on this website, and is built from the separate [`docs` repository](https://github.com/TuringLang/docs). -None of the documentation is generated from the [main Turing.jl repository](https://github.com/TuringLang/Turing.jl); in particular, the API that Turing exports does not currently form part of the documentation. - -Other sections of the website (anything that isn't a package, or a tutorial) – for example, the list of libraries – is built from the [`turinglang.github.io` repository](https://github.com/TuringLang/turinglang.github.io). - -### Tests - -Turing, like most software libraries, has a test suite. You can run the whole suite by running `julia --project=.` from the root of the Turing repository, and then running - -```julia -import Pkg; Pkg.test("Turing") -``` - -The test suite subdivides into files in the `test` folder, and you can run only some of them using commands like - -```julia -import Pkg; Pkg.test("Turing"; test_args=["optim", "hmc", "--skip", "ext"]) -``` - -This one would run all files with "optim" or "hmc" in their path, such as `test/optimisation/Optimisation.jl`, but not files with "ext" in their path. Alternatively, you can set these arguments as command line arguments when you run Julia - -```julia -julia --project=. -e 'import Pkg; Pkg.test(; test_args=ARGS)' -- optim hmc --skip ext -``` - -Or otherwise, set the global `ARGS` variable, and call `include("test/runtests.jl")`. - -### Style Guide - -Turing has a style guide, described below. -Reviewing it before making a pull request is not strictly necessary, but you may be asked to change portions of your code to conform with the style guide before it is merged. 
- -Most Turing code follows [Blue: a Style Guide for Julia](https://github.com/JuliaDiff/BlueStyle). -These conventions were created from a variety of sources including Python's [PEP8](http://legacy.python.org/dev/peps/pep-0008/), Julia's [Notes for Contributors](https://github.com/JuliaLang/julia/blob/master/CONTRIBUTING.md), and Julia's [Style Guide](https://docs.julialang.org/en/v1/manual/style-guide/). - -#### Synopsis - - - Use 4 spaces per indentation level, no tabs. - - Try to adhere to a 92 character line length limit. - - Use upper camel case convention for [modules](https://docs.julialang.org/en/v1/manual/modules/) and [types](https://docs.julialang.org/en/v1/manual/types/). - - Use lower case with underscores for method names (note: Julia code likes to use lower case without underscores). - - Comments are good, try to explain the intentions of the code. - - Use whitespace to make the code more readable. - - No whitespace at the end of a line (trailing whitespace). - - Avoid padding brackets with spaces. ex. `Int64(value)` preferred over `Int64( value )`. - -#### A Word on Consistency - -When adhering to the Blue style, it's important to realize that these are guidelines, not rules. This is [stated best in the PEP8](http://legacy.python.org/dev/peps/pep-0008/#a-foolish-consistency-is-the-hobgoblin-of-little-minds): - -> A style guide is about consistency. Consistency with this style guide is important. Consistency within a project is more important. Consistency within one module or function is most important. - -> But most importantly: know when to be inconsistent – sometimes the style guide just doesn't apply. When in doubt, use your best judgment. Look at other examples and decide what looks best. And don't hesitate to ask! - +--- +title: Contributing +aliases: + - ../../tutorials/docs-01-contributing-guide/index.html +--- + +Turing is an open-source project and is [hosted on GitHub](https://github.com/TuringLang). +We welcome contributions from the community in all forms large or small: bug reports, feature implementations, code contributions, or improvements to documentation or infrastructure are all extremely valuable. +We would also very much appreciate examples of models written using Turing. + +### How to get involved + +Our outstanding issues are tabulated on our [issue tracker](https://github.com/TuringLang/Turing.jl/issues). +Closing one of these may involve implementing new features, fixing bugs, or writing example models. + +You can also join the `#turing` channel on the [Julia Slack](https://julialang.org/slack/) and say hello! + +If you are new to open-source software, please see [GitHub's introduction](https://guides.github.com/introduction/flow/) or [Julia's contribution guide](https://github.com/JuliaLang/julia/blob/master/CONTRIBUTING.md) on using version control for collaboration. + +### Documentation + +Each of the packages in the Turing ecosystem (see [Libraries](/library)) has its own documentation, which is typically found in the `docs` folder of the corresponding package. +For example, the source code for DynamicPPL's documentation can be found in [its repository](https://github.com/TuringLang/DynamicPPL.jl). + +The documentation for Turing.jl itself consists of the tutorials that you see on this website, and is built from the separate [`docs` repository](https://github.com/TuringLang/docs). 
+None of the documentation is generated from the [main Turing.jl repository](https://github.com/TuringLang/Turing.jl); in particular, the API that Turing exports does not currently form part of the documentation.
+
+Other sections of the website (anything that isn't a package, or a tutorial) – for example, the list of libraries – are built from the [`turinglang.github.io` repository](https://github.com/TuringLang/turinglang.github.io).
+
+### Tests
+
+Turing, like most software libraries, has a test suite. You can run the whole suite by running `julia --project=.` from the root of the Turing repository, and then running
+
+```julia
+import Pkg; Pkg.test("Turing")
+```
+
+The test suite subdivides into files in the `test` folder, and you can run only some of them using commands like
+
+```julia
+import Pkg; Pkg.test("Turing"; test_args=["optim", "hmc", "--skip", "ext"])
+```
+
+This one would run all files with "optim" or "hmc" in their path, such as `test/optimisation/Optimisation.jl`, but not files with "ext" in their path. Alternatively, you can set these arguments as command line arguments when you run Julia
+
+```julia
+julia --project=. -e 'import Pkg; Pkg.test(; test_args=ARGS)' -- optim hmc --skip ext
+```
+
+or you can set the global `ARGS` variable and call `include("test/runtests.jl")`.
+
+### Style Guide
+
+Turing has a style guide, described below.
+Reviewing it before making a pull request is not strictly necessary, but you may be asked to change portions of your code to conform with the style guide before it is merged.
+
+Most Turing code follows [Blue: a Style Guide for Julia](https://github.com/JuliaDiff/BlueStyle).
+These conventions were created from a variety of sources including Python's [PEP8](http://legacy.python.org/dev/peps/pep-0008/), Julia's [Notes for Contributors](https://github.com/JuliaLang/julia/blob/master/CONTRIBUTING.md), and Julia's [Style Guide](https://docs.julialang.org/en/v1/manual/style-guide/).
+
+#### Synopsis
+
+  - Use 4 spaces per indentation level, no tabs.
+  - Try to adhere to a 92 character line length limit.
+  - Use the upper camel case convention for [modules](https://docs.julialang.org/en/v1/manual/modules/) and [types](https://docs.julialang.org/en/v1/manual/types/).
+  - Use lower case with underscores for method names (note: Julia code likes to use lower case without underscores).
+  - Comments are good; try to explain the intentions of the code.
+  - Use whitespace to make the code more readable.
+  - No whitespace at the end of a line (trailing whitespace).
+  - Avoid padding brackets with spaces, e.g. `Int64(value)` is preferred over `Int64( value )`.
+
+#### A Word on Consistency
+
+When adhering to the Blue style, it's important to realize that these are guidelines, not rules. This is [stated best in the PEP8](http://legacy.python.org/dev/peps/pep-0008/#a-foolish-consistency-is-the-hobgoblin-of-little-minds):
+
+> A style guide is about consistency. Consistency with this style guide is important. Consistency within a project is more important. Consistency within one module or function is most important.
+ diff --git a/developers/inference/abstractmcmc-interface/index.qmd b/developers/inference/abstractmcmc-interface/index.qmd index 993936209..aa8cfc210 100755 --- a/developers/inference/abstractmcmc-interface/index.qmd +++ b/developers/inference/abstractmcmc-interface/index.qmd @@ -1,323 +1,323 @@ ---- -title: Interface Guide -engine: julia -aliases: - - ../../tutorials/docs-06-for-developers-interface/index.html ---- - -```{julia} -#| echo: false -#| output: false -using Pkg; -Pkg.instantiate(); -``` - -# The sampling interface - -Turing implements a sampling interface (hosted at [AbstractMCMC](https://github.com/TuringLang/AbstractMCMC.jl)) that is intended to provide a common framework for Markov chain Monte Carlo samplers. The interface presents several structures and functions that one needs to overload in order to implement an interface-compatible sampler. - -This guide will demonstrate how to implement the interface without Turing. - -## Interface overview - -Any implementation of an inference method that uses the AbstractMCMC interface should implement a subset of the following types and functions: - -1. A subtype of `AbstractSampler`, defined as a mutable struct containing state information or sampler parameters. -2. A function `sample_init!` which performs any necessary set-up (default: do not perform any set-up). -3. A function `step!` which returns a transition that represents a single draw from the sampler. -4. A function `transitions_init` which returns a container for the transitions obtained from the sampler (default: return a `Vector{T}` of length `N` where `T` is the type of the transition obtained in the first step and `N` is the number of requested samples). -5. A function `transitions_save!` which saves transitions to the container (default: save the transition of iteration `i` at position `i` in the vector of transitions). -6. A function `sample_end!` which handles any sampler wrap-up (default: do not perform any wrap-up). -7. A function `bundle_samples` which accepts the container of transitions and returns a collection of samples (default: return the vector of transitions). - -The interface methods with exclamation points are those that are intended to allow for state mutation. Any mutating function is meant to allow mutation where needed -- you might use: - -- `sample_init!` to run some kind of sampler preparation, before sampling begins. This could mutate a sampler's state. -- `step!` might mutate a sampler flag after each sample. -- `sample_end!` contains any wrap-up you might need to do. If you were sampling in a transformed space, this might be where you convert everything back to a constrained space. - -## Why do you have an interface? - -The motivation for the interface is to allow Julia's fantastic probabilistic programming language community to have a set of standards and common implementations so we can all thrive together. Markov chain Monte Carlo methods tend to have a very similar framework to one another, and so a common interface should help more great inference methods built in single-purpose packages to experience more use among the community. - -## Implementing Metropolis-Hastings without Turing - -[Metropolis-Hastings](https://en.wikipedia.org/wiki/Markov_chain_Monte_Carlo) is often the first sampling method that people are exposed to. It is a very straightforward algorithm and is accordingly the easiest to implement, so it makes for a good example. 
In this section, you will learn how to use the types and functions listed above to implement the Metropolis-Hastings sampler using the MCMC interface. - -The full code for this implementation is housed in [AdvancedMH.jl](https://github.com/TuringLang/AdvancedMH.jl). - -### Imports - -Let's begin by importing the relevant libraries. We'll import `AbstractMCMC`, which contains the interface framework we'll fill out. We also need `Distributions` and `Random`. - -```{julia} -# Import the relevant libraries. -using AbstractMCMC: AbstractMCMC -using Distributions -using Random -``` - -An interface extension (like the one we're writing right now) typically requires that you overload or implement several functions. Specifically, you should `import` the functions you intend to overload. This next code block accomplishes that. - -From `Distributions`, we need `Sampleable`, `VariateForm`, and `ValueSupport`, three abstract types that define a distribution. Models in the interface are assumed to be subtypes of `Sampleable{VariateForm, ValueSupport}`. In this section our model is going be be extremely simple, so we will not end up using these except to make sure that the inference functions are dispatching correctly. - -### Sampler - -Let's begin our sampler definition by defining a sampler called `MetropolisHastings` which is a subtype of `AbstractSampler`. Correct typing is very important for proper interface implementation -- if you are missing a subtype, your method may not be dispatched to when you call `sample`. - -```{julia} -# Define a sampler type. -struct MetropolisHastings{T,D} <: AbstractMCMC.AbstractSampler - init_θ::T - proposal::D -end - -# Default constructors. -MetropolisHastings(init_θ::Real) = MetropolisHastings(init_θ, Normal(0, 1)) -function MetropolisHastings(init_θ::Vector{<:Real}) - return MetropolisHastings(init_θ, MvNormal(zero(init_θ), I)) -end -``` - -Above, we have defined a sampler that stores the initial parameterization of the prior, and a distribution object from which proposals are drawn. You can have a struct that has no fields, and simply use it for dispatching onto the relevant functions, or you can store a large amount of state information in your sampler. - -The general intuition for what to store in your sampler struct is that anything you may need to perform inference between samples but you don't want to store in a transition should go into the sampler struct. It's the only way you can carry non-sample related state information between `step!` calls. - -### Model - -Next, we need to have a model of some kind. A model is a struct that's a subtype of `AbstractModel` that contains whatever information is necessary to perform inference on your problem. In our case we want to know the mean and variance parameters for a standard Normal distribution, so we can keep our model to the log density of a Normal. - -Note that we only have to do this because we are not yet integrating the sampler with Turing -- Turing has a very sophisticated modelling engine that removes the need to define custom model structs. - -```{julia} -# Define a model type. Stores the log density function. -struct DensityModel{F<:Function} <: AbstractMCMC.AbstractModel - ℓπ::F -end -``` - -### Transition - -The next step is to define some transition which we will return from each `step!` call. 
We'll keep it simple by just defining a wrapper struct that contains the parameter draws and the log density of that draw: - -```{julia} -# Create a very basic Transition type, only stores the -# parameter draws and the log probability of the draw. -struct Transition{T,L} - θ::T - lp::L -end - -# Store the new draw and its log density. -Transition(model::DensityModel, θ) = Transition(θ, ℓπ(model, θ)) -``` - -`Transition` can now store any type of parameter, whether it's a vector of draws from multiple parameters or a single univariate draw. - -### Metropolis-Hastings - -Now it's time to get into the actual inference. We've defined all of the core pieces we need, but we need to implement the `step!` function which actually performs inference. - -As a refresher, Metropolis-Hastings implements a very basic algorithm: - -1. Pick some initial state, ``\theta_0``. - -2. For ``t`` in ``[1,N],`` do - - + Generate a proposal parameterization ``\theta^\prime_t \sim q(\theta^\prime_t \mid \theta_{t-1}).`` - - + Calculate the acceptance probability, ``\alpha = \text{min}\left[1,\frac{\pi(\theta'_t)}{\pi(\theta_{t-1})} \frac{q(\theta_{t-1} \mid \theta'_t)}{q(\theta'_t \mid \theta_{t-1})}) \right].`` - - + If ``U \le \alpha`` where ``U \sim [0,1],`` then ``\theta_t = \theta'_t.`` Otherwise, ``\theta_t = \theta_{t-1}.`` - -Of course, it's much easier to do this in the log space, so the acceptance probability is more commonly written as - -```{.cell-bg} -\log \alpha = \min\left[0, \log \pi(\theta'_t) - \log \pi(\theta_{t-1}) + \log q(\theta_{t-1} \mid \theta^\prime_t) - \log q(\theta\prime_t \mid \theta_{t-1}) \right]. -``` - -In interface terms, we should do the following: - -1. Make a new transition containing a proposed sample. -2. Calculate the acceptance probability. -3. If we accept, return the new transition, otherwise, return the old one. - -### Steps - -The `step!` function is the function that performs the bulk of your inference. In our case, we will implement two `step!` functions -- one for the very first iteration, and one for every subsequent iteration. - -```{julia} -#| eval: false -# Define the first step! function, which is called at the -# beginning of sampling. Return the initial parameter used -# to define the sampler. -function AbstractMCMC.step!( - rng::AbstractRNG, - model::DensityModel, - spl::MetropolisHastings, - N::Integer, - ::Nothing; - kwargs..., -) - return Transition(model, spl.init_θ) -end -``` - -The first `step!` function just packages up the initial parameterization inside the sampler, and returns it. We implicitly accept the very first parameterization. - -The other `step!` function performs the usual steps from Metropolis-Hastings. Included are several helper functions, `proposal` and `q`, which are designed to replicate the functions in the pseudocode above. - -- `proposal` generates a new proposal in the form of a `Transition`, which can be univariate if the value passed in is univariate, or it can be multivariate if the `Transition` given is multivariate. Proposals use a basic `Normal` or `MvNormal` proposal distribution. -- `q` returns the log density of one parameterization conditional on another, according to the proposal distribution. -- `step!` generates a new proposal, checks the acceptance probability, and then returns either the previous transition or the proposed transition. - - -```{julia} -#| eval: false -# Define a function that makes a basic proposal depending on a univariate -# parameterization or a multivariate parameterization. 
-function propose(spl::MetropolisHastings, model::DensityModel, θ::Real) - return Transition(model, θ + rand(spl.proposal)) -end -function propose(spl::MetropolisHastings, model::DensityModel, θ::Vector{<:Real}) - return Transition(model, θ + rand(spl.proposal)) -end -function propose(spl::MetropolisHastings, model::DensityModel, t::Transition) - return propose(spl, model, t.θ) -end - -# Calculates the probability `q(θ|θcond)`, using the proposal distribution `spl.proposal`. -q(spl::MetropolisHastings, θ::Real, θcond::Real) = logpdf(spl.proposal, θ - θcond) -function q(spl::MetropolisHastings, θ::Vector{<:Real}, θcond::Vector{<:Real}) - return logpdf(spl.proposal, θ - θcond) -end -q(spl::MetropolisHastings, t1::Transition, t2::Transition) = q(spl, t1.θ, t2.θ) - -# Calculate the density of the model given some parameterization. -ℓπ(model::DensityModel, θ) = model.ℓπ(θ) -ℓπ(model::DensityModel, t::Transition) = t.lp - -# Define the other step function. Returns a Transition containing -# either a new proposal (if accepted) or the previous proposal -# (if not accepted). -function AbstractMCMC.step!( - rng::AbstractRNG, - model::DensityModel, - spl::MetropolisHastings, - ::Integer, - θ_prev::Transition; - kwargs..., -) - # Generate a new proposal. - θ = propose(spl, model, θ_prev) - - # Calculate the log acceptance probability. - α = ℓπ(model, θ) - ℓπ(model, θ_prev) + q(spl, θ_prev, θ) - q(spl, θ, θ_prev) - - # Decide whether to return the previous θ or the new one. - if log(rand(rng)) < min(α, 0.0) - return θ - else - return θ_prev - end -end -``` - -### Chains - -In the default implementation, `sample` just returns a vector of all transitions. If instead you would like to obtain a `Chains` object (e.g., to simplify downstream analysis), you have to implement the `bundle_samples` function as well. It accepts the vector of transitions and returns a collection of samples. Fortunately, our `Transition` is incredibly simple, and we only need to build a little bit of functionality to accept custom parameter names passed in by the user. - -```{julia} -#| eval: false -# A basic chains constructor that works with the Transition struct we defined. -function AbstractMCMC.bundle_samples( - rng::AbstractRNG, - ℓ::DensityModel, - s::MetropolisHastings, - N::Integer, - ts::Vector{<:Transition}, - chain_type::Type{Any}; - param_names=missing, - kwargs..., -) - # Turn all the transitions into a vector-of-vectors. - vals = copy(reduce(hcat, [vcat(t.θ, t.lp) for t in ts])') - - # Check if we received any parameter names. - if ismissing(param_names) - param_names = ["Parameter $i" for i in 1:(length(first(vals)) - 1)] - end - - # Add the log density field to the parameter names. - push!(param_names, "lp") - - # Bundle everything up and return a Chains struct. - return Chains(vals, param_names, (internals=["lp"],)) -end -``` - -All done! - -You can even implement different output formats by implementing `bundle_samples` for different `chain_type`s, which can be provided as keyword argument to `sample`. As default `sample` uses `chain_type = Any`. - -### Testing the implementation - -Now that we have all the pieces, we should test the implementation by defining a model to calculate the mean and variance parameters of a Normal distribution. We can do this by constructing a target density function, providing a sample of data, and then running the sampler with `sample`. - -```{julia} -#| eval: false -# Generate a set of data from the posterior we want to estimate. 
-data = rand(Normal(5, 3), 30) - -# Define the components of a basic model. -insupport(θ) = θ[2] >= 0 -dist(θ) = Normal(θ[1], θ[2]) -density(θ) = insupport(θ) ? sum(logpdf.(dist(θ), data)) : -Inf - -# Construct a DensityModel. -model = DensityModel(density) - -# Set up our sampler with initial parameters. -spl = MetropolisHastings([0.0, 0.0]) - -# Sample from the posterior. -chain = sample(model, spl, 100000; param_names=["μ", "σ"]) -``` - -If all the interface functions have been extended properly, you should get an output from `display(chain)` that looks something like this: - - -```{.cell-bg} -Object of type Chains, with data of type 100000×3×1 Array{Float64,3} - -Iterations = 1:100000 -Thinning interval = 1 -Chains = 1 -Samples per chain = 100000 -internals = lp -parameters = μ, σ - -2-element Array{ChainDataFrame,1} - -Summary Statistics - -│ Row │ parameters │ mean │ std │ naive_se │ mcse │ ess │ r_hat │ -│ │ Symbol │ Float64 │ Float64 │ Float64 │ Float64 │ Any │ Any │ -├─────┼────────────┼─────────┼──────────┼────────────┼────────────┼─────────┼─────────┤ -│ 1 │ μ │ 5.33157 │ 0.854193 │ 0.0027012 │ 0.00893069 │ 8344.75 │ 1.00009 │ -│ 2 │ σ │ 4.54992 │ 0.632916 │ 0.00200146 │ 0.00534942 │ 14260.8 │ 1.00005 │ - -Quantiles - -│ Row │ parameters │ 2.5% │ 25.0% │ 50.0% │ 75.0% │ 97.5% │ -│ │ Symbol │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │ -├─────┼────────────┼─────────┼─────────┼─────────┼─────────┼─────────┤ -│ 1 │ μ │ 3.6595 │ 4.77754 │ 5.33182 │ 5.89509 │ 6.99651 │ -│ 2 │ σ │ 3.5097 │ 4.09732 │ 4.47805 │ 4.93094 │ 5.96821 │ -``` - -It looks like we're extremely close to our true parameters of `Normal(5,3)`, though with a fairly high variance due to the low sample size. - -## Conclusion - -We've seen how to implement the sampling interface for general projects. Turing's interface methods are ever-evolving, so please open an issue at [AbstractMCMC](https://github.com/TuringLang/AbstractMCMC.jl) with feature requests or problems. +--- +title: Interface Guide +engine: julia +aliases: + - ../../tutorials/docs-06-for-developers-interface/index.html +--- + +```{julia} +#| echo: false +#| output: false +using Pkg; +Pkg.instantiate(); +``` + +# The sampling interface + +Turing implements a sampling interface (hosted at [AbstractMCMC](https://github.com/TuringLang/AbstractMCMC.jl)) that is intended to provide a common framework for Markov chain Monte Carlo samplers. The interface presents several structures and functions that one needs to overload in order to implement an interface-compatible sampler. + +This guide will demonstrate how to implement the interface without Turing. + +## Interface overview + +Any implementation of an inference method that uses the AbstractMCMC interface should implement a subset of the following types and functions: + +1. A subtype of `AbstractSampler`, defined as a mutable struct containing state information or sampler parameters. +2. A function `sample_init!` which performs any necessary set-up (default: do not perform any set-up). +3. A function `step!` which returns a transition that represents a single draw from the sampler. +4. A function `transitions_init` which returns a container for the transitions obtained from the sampler (default: return a `Vector{T}` of length `N` where `T` is the type of the transition obtained in the first step and `N` is the number of requested samples). +5. A function `transitions_save!` which saves transitions to the container (default: save the transition of iteration `i` at position `i` in the vector of transitions). 
+6. A function `sample_end!` which handles any sampler wrap-up (default: do not perform any wrap-up).
+7. A function `bundle_samples` which accepts the container of transitions and returns a collection of samples (default: return the vector of transitions).
+
+The interface methods with exclamation points are those that are intended to allow for state mutation. Any mutating function is meant to allow mutation where needed -- you might use:
+
+- `sample_init!` to run some kind of sampler preparation, before sampling begins. This could mutate a sampler's state.
+- `step!` might mutate a sampler flag after each sample.
+- `sample_end!` contains any wrap-up you might need to do. If you were sampling in a transformed space, this might be where you convert everything back to a constrained space.
+
+## Why do you have an interface?
+
+The motivation for the interface is to allow Julia's fantastic probabilistic programming language community to have a set of standards and common implementations so we can all thrive together. Markov chain Monte Carlo methods tend to have a very similar framework to one another, and so a common interface should help great inference methods built in single-purpose packages see more use among the community.
+
+## Implementing Metropolis-Hastings without Turing
+
+[Metropolis-Hastings](https://en.wikipedia.org/wiki/Markov_chain_Monte_Carlo) is often the first sampling method that people are exposed to. It is a very straightforward algorithm and is accordingly the easiest to implement, so it makes for a good example. In this section, you will learn how to use the types and functions listed above to implement the Metropolis-Hastings sampler using the MCMC interface.
+
+The full code for this implementation is housed in [AdvancedMH.jl](https://github.com/TuringLang/AdvancedMH.jl).
+
+### Imports
+
+Let's begin by importing the relevant libraries. We'll import `AbstractMCMC`, which contains the interface framework we'll fill out. We also need `Distributions`, `Random`, and `LinearAlgebra` (the latter provides the identity matrix `I` used in the multivariate proposal below).
+
+```{julia}
+# Import the relevant libraries.
+using AbstractMCMC: AbstractMCMC
+using Distributions
+using Random
+using LinearAlgebra: I  # identity matrix, used in the MvNormal proposal below
+```
+
+An interface extension (like the one we're writing right now) typically requires that you overload or implement several functions. Specifically, you should `import` the functions you intend to overload. This next code block accomplishes that.
+
+From `Distributions`, we need `Sampleable`, `VariateForm`, and `ValueSupport`, three abstract types that define a distribution. Models in the interface are assumed to be subtypes of `Sampleable{VariateForm, ValueSupport}`. In this section our model is going to be extremely simple, so we will not end up using these except to make sure that the inference functions are dispatching correctly.
+
+### Sampler
+
+Let's begin our sampler definition by defining a sampler called `MetropolisHastings` which is a subtype of `AbstractSampler`. Correct typing is very important for proper interface implementation -- if you are missing a subtype, your method may not be dispatched to when you call `sample`.
+
+```{julia}
+# Define a sampler type.
+struct MetropolisHastings{T,D} <: AbstractMCMC.AbstractSampler
+    init_θ::T
+    proposal::D
+end
+
+# Default constructors.
+MetropolisHastings(init_θ::Real) = MetropolisHastings(init_θ, Normal(0, 1))
+function MetropolisHastings(init_θ::Vector{<:Real})
+    return MetropolisHastings(init_θ, MvNormal(zero(init_θ), I))
+end
+```
+
+Above, we have defined a sampler that stores the initial parameter values, and a distribution object from which proposals are drawn. You can have a struct that has no fields, and simply use it for dispatching onto the relevant functions, or you can store a large amount of state information in your sampler.
+
+The general intuition for what to store in your sampler struct is that anything you may need to perform inference between samples but don't want to store in a transition should go into the sampler struct. It's the only way you can carry non-sample-related state information between `step!` calls.
+
+### Model
+
+Next, we need to have a model of some kind. A model is a struct that's a subtype of `AbstractModel` that contains whatever information is necessary to perform inference on your problem. In our case we want to know the mean and variance parameters for a standard Normal distribution, so we can keep our model to the log density of a Normal.
+
+Note that we only have to do this because we are not yet integrating the sampler with Turing -- Turing has a very sophisticated modelling engine that removes the need to define custom model structs.
+
+```{julia}
+# Define a model type. Stores the log density function.
+struct DensityModel{F<:Function} <: AbstractMCMC.AbstractModel
+    ℓπ::F
+end
+```
+
+### Transition
+
+The next step is to define a transition, which we will return from each `step!` call. We'll keep it simple by just defining a wrapper struct that contains the parameter draws and the log density of that draw:
+
+```{julia}
+# Create a very basic Transition type, only stores the
+# parameter draws and the log probability of the draw.
+struct Transition{T,L}
+    θ::T
+    lp::L
+end
+
+# Store the new draw and its log density.
+Transition(model::DensityModel, θ) = Transition(θ, ℓπ(model, θ))
+```
+
+`Transition` can now store any type of parameter, whether it's a vector of draws from multiple parameters or a single univariate draw.
+
+### Metropolis-Hastings
+
+Now it's time to get into the actual inference. We've defined all of the core pieces we need, but we still need to implement the `step!` function which actually performs inference.
+
+As a refresher, Metropolis-Hastings implements a very basic algorithm:
+
+1. Pick some initial state, ``\theta_0``.
+
+2. For ``t`` in ``[1,N],`` do
+
+     + Generate a proposal parameterization ``\theta^\prime_t \sim q(\theta^\prime_t \mid \theta_{t-1}).``
+
+     + Calculate the acceptance probability, ``\alpha = \min\left[1, \frac{\pi(\theta^\prime_t)}{\pi(\theta_{t-1})} \frac{q(\theta_{t-1} \mid \theta^\prime_t)}{q(\theta^\prime_t \mid \theta_{t-1})} \right].``
+
+     + If ``U \le \alpha`` where ``U \sim \text{Uniform}(0,1),`` then ``\theta_t = \theta^\prime_t.`` Otherwise, ``\theta_t = \theta_{t-1}.``
+
+Of course, it's much easier to do this in the log space, so the acceptance probability is more commonly written as
+
+```{.cell-bg}
+\log \alpha = \min\left[0, \log \pi(\theta^\prime_t) - \log \pi(\theta_{t-1}) + \log q(\theta_{t-1} \mid \theta^\prime_t) - \log q(\theta^\prime_t \mid \theta_{t-1}) \right].
+```
+
+In interface terms, we should do the following:
+
+1. Make a new transition containing a proposed sample.
+2. Calculate the acceptance probability.
+3. If we accept, return the new transition; otherwise, return the old one.
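+
+As a quick sanity check of the log-space formula above, here is a tiny standalone sketch with made-up numbers (independent of the interface code that follows): for a symmetric proposal, the two `q` terms are equal and cancel, so only the target densities matter.
+
+```{julia}
+#| eval: false
+using Distributions
+
+# Toy target: a standard Normal. With a symmetric proposal, q(θ_prev | θ_prop)
+# and q(θ_prop | θ_prev) are equal, so the q terms drop out of log α.
+logπ(θ) = logpdf(Normal(0, 1), θ)
+
+θ_prev, θ_prop = 0.5, 1.2
+logα = min(0.0, logπ(θ_prop) - logπ(θ_prev))  # ≈ -0.595
+
+# Accept the proposal with probability exp(logα):
+accept = log(rand()) < logα
+```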
+
+### Steps
+
+The `step!` function is the function that performs the bulk of your inference. In our case, we will implement two `step!` functions -- one for the very first iteration, and one for every subsequent iteration.
+
+```{julia}
+#| eval: false
+# Define the first step! function, which is called at the
+# beginning of sampling. Return the initial parameter used
+# to define the sampler.
+function AbstractMCMC.step!(
+    rng::AbstractRNG,
+    model::DensityModel,
+    spl::MetropolisHastings,
+    N::Integer,
+    ::Nothing;
+    kwargs...,
+)
+    return Transition(model, spl.init_θ)
+end
+```
+
+The first `step!` function just packages up the initial parameterization inside the sampler, and returns it. We implicitly accept the very first parameterization.
+
+The other `step!` function performs the usual steps from Metropolis-Hastings. Included are several helper functions, `propose` and `q`, which are designed to replicate the functions in the pseudocode above.
+
+- `propose` generates a new proposal in the form of a `Transition`, which can be univariate if the value passed in is univariate, or it can be multivariate if the `Transition` given is multivariate. Proposals use a basic `Normal` or `MvNormal` proposal distribution.
+- `q` returns the log density of one parameterization conditional on another, according to the proposal distribution.
+- `step!` generates a new proposal, checks the acceptance probability, and then returns either the previous transition or the proposed transition.
+
+```{julia}
+#| eval: false
+# Define a function that makes a basic proposal depending on a univariate
+# parameterization or a multivariate parameterization.
+function propose(spl::MetropolisHastings, model::DensityModel, θ::Real)
+    return Transition(model, θ + rand(spl.proposal))
+end
+function propose(spl::MetropolisHastings, model::DensityModel, θ::Vector{<:Real})
+    return Transition(model, θ + rand(spl.proposal))
+end
+function propose(spl::MetropolisHastings, model::DensityModel, t::Transition)
+    return propose(spl, model, t.θ)
+end
+
+# Calculate the log density `q(θ | θcond)` under the proposal distribution `spl.proposal`.
+q(spl::MetropolisHastings, θ::Real, θcond::Real) = logpdf(spl.proposal, θ - θcond)
+function q(spl::MetropolisHastings, θ::Vector{<:Real}, θcond::Vector{<:Real})
+    return logpdf(spl.proposal, θ - θcond)
+end
+q(spl::MetropolisHastings, t1::Transition, t2::Transition) = q(spl, t1.θ, t2.θ)
+
+# Calculate the log density of the model given some parameterization.
+ℓπ(model::DensityModel, θ) = model.ℓπ(θ)
+ℓπ(model::DensityModel, t::Transition) = t.lp
+
+# Define the other step function. Returns a Transition containing
+# either a new proposal (if accepted) or the previous proposal
+# (if not accepted).
+function AbstractMCMC.step!(
+    rng::AbstractRNG,
+    model::DensityModel,
+    spl::MetropolisHastings,
+    ::Integer,
+    θ_prev::Transition;
+    kwargs...,
+)
+    # Generate a new proposal.
+    θ = propose(spl, model, θ_prev)
+
+    # Calculate the log acceptance probability.
+    α = ℓπ(model, θ) - ℓπ(model, θ_prev) + q(spl, θ_prev, θ) - q(spl, θ, θ_prev)
+
+    # Decide whether to return the previous θ or the new one.
+    if log(rand(rng)) < min(α, 0.0)
+        return θ
+    else
+        return θ_prev
+    end
+end
+```
+
+### Chains
+
+In the default implementation, `sample` just returns a vector of all transitions. If instead you would like to obtain a `Chains` object (e.g., to simplify downstream analysis), you have to implement the `bundle_samples` function as well.
+It accepts the vector of transitions and returns a collection of samples. Fortunately, our `Transition` is incredibly simple, and we only need to build a little bit of functionality to accept custom parameter names passed in by the user.
+
+```{julia}
+#| eval: false
+# A basic chains constructor that works with the Transition struct we defined.
+function AbstractMCMC.bundle_samples(
+    rng::AbstractRNG,
+    ℓ::DensityModel,
+    s::MetropolisHastings,
+    N::Integer,
+    ts::Vector{<:Transition},
+    chain_type::Type{Any};
+    param_names=missing,
+    kwargs...,
+)
+    # Turn all the transitions into a matrix with one row per sample and one
+    # column per parameter (plus a final column for the log density).
+    vals = copy(reduce(hcat, [vcat(t.θ, t.lp) for t in ts])')
+
+    # Check if we received any parameter names.
+    if ismissing(param_names)
+        param_names = ["Parameter $i" for i in 1:(size(vals, 2) - 1)]
+    end
+
+    # Add the log density field to the parameter names.
+    push!(param_names, "lp")
+
+    # Bundle everything up and return a Chains struct.
+    return Chains(vals, param_names, (internals=["lp"],))
+end
+```
+
+All done!
+
+You can even implement different output formats by implementing `bundle_samples` for different `chain_type`s, which can be provided as a keyword argument to `sample`. By default, `sample` uses `chain_type = Any`.
+
+### Testing the implementation
+
+Now that we have all the pieces, we should test the implementation by defining a model to calculate the mean and variance parameters of a Normal distribution. We can do this by constructing a target density function, providing a sample of data, and then running the sampler with `sample`.
+
+```{julia}
+#| eval: false
+# Generate a set of data from the distribution whose parameters we want to estimate.
+data = rand(Normal(5, 3), 30)
+
+# Define the components of a basic model.
+insupport(θ) = θ[2] >= 0
+dist(θ) = Normal(θ[1], θ[2])
+density(θ) = insupport(θ) ? sum(logpdf.(dist(θ), data)) : -Inf
+
+# Construct a DensityModel.
+model = DensityModel(density)
+
+# Set up our sampler with initial parameters.
+spl = MetropolisHastings([0.0, 0.0])
+
+# Sample from the posterior.
+chain = sample(model, spl, 100000; param_names=["μ", "σ"])
+```
+
+If all the interface functions have been extended properly, you should get an output from `display(chain)` that looks something like this:
+
+```{.cell-bg}
+Object of type Chains, with data of type 100000×3×1 Array{Float64,3}
+
+Iterations = 1:100000
+Thinning interval = 1
+Chains = 1
+Samples per chain = 100000
+internals = lp
+parameters = μ, σ
+
+2-element Array{ChainDataFrame,1}
+
+Summary Statistics
+
+│ Row │ parameters │ mean    │ std      │ naive_se   │ mcse       │ ess     │ r_hat   │
+│     │ Symbol     │ Float64 │ Float64  │ Float64    │ Float64    │ Any     │ Any     │
+├─────┼────────────┼─────────┼──────────┼────────────┼────────────┼─────────┼─────────┤
+│ 1   │ μ          │ 5.33157 │ 0.854193 │ 0.0027012  │ 0.00893069 │ 8344.75 │ 1.00009 │
+│ 2   │ σ          │ 4.54992 │ 0.632916 │ 0.00200146 │ 0.00534942 │ 14260.8 │ 1.00005 │
+
+Quantiles
+
+│ Row │ parameters │ 2.5%    │ 25.0%   │ 50.0%   │ 75.0%   │ 97.5%   │
+│     │ Symbol     │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │
+├─────┼────────────┼─────────┼─────────┼─────────┼─────────┼─────────┤
+│ 1   │ μ          │ 3.6595  │ 4.77754 │ 5.33182 │ 5.89509 │ 6.99651 │
+│ 2   │ σ          │ 3.5097  │ 4.09732 │ 4.47805 │ 4.93094 │ 5.96821 │
+```
+
+It looks like we're extremely close to our true parameters of `Normal(5,3)`, though with a fairly high variance due to the low sample size.
+
+## Conclusion
+
+We've seen how to implement the sampling interface for general projects.
Turing's interface methods are ever-evolving, so please open an issue at [AbstractMCMC](https://github.com/TuringLang/AbstractMCMC.jl) with feature requests or problems. diff --git a/developers/inference/abstractmcmc-turing/index.qmd b/developers/inference/abstractmcmc-turing/index.qmd index 6d313f232..bf1bd1489 100755 --- a/developers/inference/abstractmcmc-turing/index.qmd +++ b/developers/inference/abstractmcmc-turing/index.qmd @@ -1,329 +1,329 @@ ---- -title: How Turing Implements AbstractMCMC -engine: julia -aliases: - - ../../tutorials/docs-04-for-developers-abstractmcmc-turing/index.html ---- - -```{julia} -#| echo: false -#| output: false -using Pkg; -Pkg.instantiate(); -``` - -Prerequisite: [Interface guide]({{}}). - -## Introduction - -Consider the following Turing, code block: - -```{julia} -using Turing - -@model function gdemo(x, y) - s² ~ InverseGamma(2, 3) - m ~ Normal(0, sqrt(s²)) - x ~ Normal(m, sqrt(s²)) - return y ~ Normal(m, sqrt(s²)) -end - -mod = gdemo(1.5, 2) -alg = IS() -n_samples = 1000 - -chn = sample(mod, alg, n_samples, progress=false) -``` - -The function `sample` is part of the AbstractMCMC interface. As explained in the [interface guide]({{}}), building a sampling method that can be used by `sample` consists in overloading the structs and functions in `AbstractMCMC`. The interface guide also gives a standalone example of their implementation, [`AdvancedMH.jl`](). - -Turing sampling methods (most of which are written [here](https://github.com/TuringLang/Turing.jl/tree/master/src/mcmc)) also implement `AbstractMCMC`. Turing defines a particular architecture for `AbstractMCMC` implementations, that enables working with models defined by the `@model` macro, and uses DynamicPPL as a backend. The goal of this page is to describe this architecture, and how you would go about implementing your own sampling method in Turing, using Importance Sampling as an example. I don't go into all the details: for instance, I don't address selectors or parallelism. - -First, we explain how Importance Sampling works in the abstract. Consider the model defined in the first code block. Mathematically, it can be written: - -$$ -\begin{align*} -s &\sim \text{InverseGamma}(2, 3), \\ -m &\sim \text{Normal}(0, \sqrt{s}), \\ -x &\sim \text{Normal}(m, \sqrt{s}), \\ -y &\sim \text{Normal}(m, \sqrt{s}). -\end{align*} -$$ - -The **latent** variables are $s$ and $m$, the **observed** variables are $x$ and $y$. The model **joint** distribution $p(s,m,x,y)$ decomposes into the **prior** $p(s,m)$ and the **likelihood** $p(x,y \mid s,m).$ Since $x = 1.5$ and $y = 2$ are observed, the goal is to infer the **posterior** distribution $p(s,m \mid x,y).$ - -Importance Sampling produces independent samples $(s_i, m_i)$ from the prior distribution. It also outputs unnormalized weights - -$$ -w_i = \frac {p(x,y,s_i,m_i)} {p(s_i, m_i)} = p(x,y \mid s_i, m_i) -$$ - -such that the empirical distribution - -$$ -\frac{1}{N} \sum_{i =1}^N \frac {w_i} {\sum_{j=1}^N w_j} \delta_{(s_i, m_i)} -$$ - -is a good approximation of the posterior. - -## 1. Define a Sampler - -Recall the last line of the above code block: - -```{julia} -chn = sample(mod, alg, n_samples, progress=false) -``` - -Here `sample` takes as arguments a **model** `mod`, an **algorithm** `alg`, and a **number of samples** `n_samples`, and returns an instance `chn` of `Chains` which can be analysed using the functions in `MCMCChains`. 
- -### Models - -To define a **model**, you declare a joint distribution on variables in the `@model` macro, and specify which variables are observed and which should be inferred, as well as the value of the observed variables. Thus, when implementing Importance Sampling, - -```{julia} -mod = gdemo(1.5, 2) -``` - -creates an instance `mod` of the struct `Model`, which corresponds to the observations of a value of `1.5` for `x`, and a value of `2` for `y`. - -This is all handled by DynamicPPL, more specifically [here](https://github.com/TuringLang/DynamicPPL.jl/blob/master/src/model.jl). I will return to how models are used to inform sampling algorithms [below](#assumeobserve). - -### Algorithms - -An **algorithm** is just a sampling method: in Turing, it is a subtype of the abstract type `InferenceAlgorithm`. Defining an algorithm may require specifying a few high-level parameters. For example, "Hamiltonian Monte-Carlo" may be too vague, but "Hamiltonian Monte Carlo with 10 leapfrog steps per proposal and a stepsize of 0.01" is an algorithm. "Metropolis-Hastings" may be too vague, but "Metropolis-Hastings with proposal distribution `p`" is an algorithm. -Thus - -```{julia} -stepsize = 0.01 -L = 10 -alg = HMC(stepsize, L) -``` - -defines a Hamiltonian Monte-Carlo algorithm, an instance of `HMC`, which is a subtype of `InferenceAlgorithm`. - -In the case of Importance Sampling, there is no need to specify additional parameters: - -```{julia} -alg = IS() -``` - -defines an Importance Sampling algorithm, an instance of `IS`, a subtype of `InferenceAlgorithm`. - -When creating your own Turing sampling method, you must, therefore, build a subtype of `InferenceAlgorithm` corresponding to your method. - -### Samplers - -Samplers are **not** the same as algorithms. An algorithm is a generic sampling method, a sampler is an object that stores information about how algorithm and model interact during sampling, and is modified as sampling progresses. The `Sampler` struct is defined in DynamicPPL. - -Turing implements `AbstractMCMC`'s `AbstractSampler` with the `Sampler` struct defined in `DynamicPPL`. The most important attributes of an instance `spl` of `Sampler` are: - -- `spl.alg`: the sampling method used, an instance of a subtype of `InferenceAlgorithm` -- `spl.state`: information about the sampling process, see [below](#states) - -When you call `sample(mod, alg, n_samples)`, Turing first uses `model` and `alg` to build an instance `spl` of `Sampler` , then calls the native `AbstractMCMC` function `sample(mod, spl, n_samples)`. - -When you define your own Turing sampling method, you must therefore build: - -- a **sampler constructor** that uses a model and an algorithm to initialize an instance of `Sampler`. For Importance Sampling: - -```{julia} -#| eval: false -function Sampler(alg::IS, model::Model, s::Selector) - info = Dict{Symbol,Any}() - state = ISState(model) - return Sampler(alg, info, s, state) -end -``` - -- a **state** struct implementing `AbstractSamplerState` corresponding to your method: we cover this in the following paragraph. - -### States - -The `vi` field contains all the important information about sampling: first and foremost, the values of all the samples, but also the distributions from which they are sampled, the names of model parameters, and other metadata. As we will see below, many important steps during sampling correspond to queries or updates to `spl.state.vi`. 
- -By default, you can use `SamplerState`, a concrete type defined in `inference/Inference.jl`, which extends `AbstractSamplerState` and has no field except for `vi`: - -```{julia} -#| eval: false -mutable struct SamplerState{VIType<:VarInfo} <: AbstractSamplerState - vi::VIType -end -``` - -When doing Importance Sampling, we care not only about the values of the samples but also their weights. We will see below that the weight of each sample is also added to `spl.state.vi`. Moreover, the average - -$$ -\frac 1 N \sum_{j=1}^N w_i = \frac 1 N \sum_{j=1}^N p(x,y \mid s_i, m_i) -$$ - -of the sample weights is a particularly important quantity: - -- it is used to **normalize** the **empirical approximation** of the posterior distribution -- its logarithm is the importance sampling **estimate** of the **log evidence** $\log p(x, y)$ - -To avoid having to compute it over and over again, `is.jl`defines an IS-specific concrete type `ISState` for sampler states, with an additional field `final_logevidence` containing - -$$ -\log \frac 1 N \sum_{j=1}^N w_i. -$$ - -```{julia} -#| eval: false -mutable struct ISState{V<:VarInfo,F<:AbstractFloat} <: AbstractSamplerState - vi::V - final_logevidence::F -end - -# additional constructor -ISState(model::Model) = ISState(VarInfo(model), 0.0) -``` - -The following diagram summarizes the hierarchy presented above. - -```{dot} -//| echo: false -digraph G { - node [shape=box]; - - spl [label=Sampler
<:AbstractSampler>, style=rounded, xlabel="", shape=box]; - state [label=State
<:AbstractSamplerState>, style=rounded, xlabel="", shape=box]; - alg [label=Algorithm
<:InferenceAlgorithm>, style=rounded, xlabel="", shape=box]; - vi [label=VarInfo
<:AbstractVarInfo>, style=rounded, xlabel="", shape=box]; - placeholder1 [label="...", width=1]; - placeholder2 [label="...", width=1]; - placeholder3 [label="...", width=1]; - placeholder4 [label="...", width=1]; - - spl -> state; - spl -> alg; - spl -> placeholder1; - - state -> vi; - state -> placeholder2; - - alg -> placeholder3; - placeholder1 -> placeholder4; -} -``` - -## 2. Overload the functions used inside mcmcsample - -A lot of the things here are method-specific. However, Turing also has some functions that make it easier for you to implement these functions, for example. - -### Transitions - -`AbstractMCMC` stores information corresponding to each individual sample in objects called `transition`, but does not specify what the structure of these objects could be. You could decide to implement a type `MyTransition` for transitions corresponding to the specifics of your methods. However, there are many situations in which the only information you need for each sample is: - -- its value: $\theta$ -- log of the joint probability of the observed data and this sample: `lp` - -`Inference.jl` [defines](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/Inference.jl#L103) a struct `Transition`, which corresponds to this default situation - -```{julia} -#| eval: false -struct Transition{T,F<:AbstractFloat} - θ::T - lp::F -end -``` - -It also [contains](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/Inference.jl#L108) a constructor that builds an instance of `Transition` from an instance `spl` of `Sampler`: $\theta$ is `spl.state.vi` converted to a `namedtuple`, and `lp` is `getlogp(spl.state.vi)`. `is.jl` uses this default constructor at the end of the `step!` function [here](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/is.jl#L58). - -### How `sample` works - -A crude summary, which ignores things like parallelism, is the following: - -`sample` calls `mcmcsample`, which calls - -- `sample_init!` to set things up -- `step!` repeatedly to produce multiple new transitions -- `sample_end!` to perform operations once all samples have been obtained -- `bundle_samples` to convert a vector of transitions into a more palatable type, for instance a `Chain`. - -You can, of course, implement all of these functions, but `AbstractMCMC` as well as Turing, also provide default implementations for simple cases. For instance, importance sampling uses the default implementations of `sample_init!` and `bundle_samples`, which is why you don't see code for them inside `is.jl`. - -## 3. Overload assume and observe - -The functions mentioned above, such as `sample_init!`, `step!`, etc., must, of course, use information about the model in order to generate samples! In particular, these functions may need **samples from distributions** defined in the model or to **evaluate the density of these distributions** at some values of the corresponding parameters or observations. - -For an example of the former, consider **Importance Sampling** as defined in `is.jl`. This implementation of Importance Sampling uses the model prior distribution as a proposal distribution, and therefore requires **samples from the prior distribution** of the model. Another example is **Approximate Bayesian Computation**, which requires multiple **samples from the model prior and likelihood distributions** in order to generate a single sample. - -An example of the latter is the **Metropolis-Hastings** algorithm. 
At every step of sampling from a target posterior - -$$ -p(\theta \mid x_{\text{obs}}), -$$ - -in order to compute the acceptance ratio, you need to **evaluate the model joint density** - -$$ -p\left(\theta_{\text{prop}}, x_{\text{obs}}\right) -$$ - -with $\theta_{\text{prop}}$ a sample from the proposal and $x_{\text{obs}}$ the observed data. - -This begs the question: how can these functions access model information during sampling? Recall that the model is stored as an instance `m` of `Model`. One of the attributes of `m` is the model evaluation function `m.f`, which is built by compiling the `@model` macro. Executing `f` runs the tilde statements of the model in order, and adds model information to the sampler (the instance of `Sampler` that stores information about the ongoing sampling process) at each step (see [here](https://turinglang.org/dev/docs/for-developers/compiler) for more information about how the `@model` macro is compiled). The DynamicPPL functions `assume` and `observe` determine what kind of information to add to the sampler for every tilde statement. - -Consider an instance `m` of `Model` and a sampler `spl`, with associated `VarInfo` `vi = spl.state.vi`. At some point during the sampling process, an AbstractMCMC function such as `step!` calls `m(vi, ...)`, which calls the model evaluation function `m.f(vi, ...)`. - - - for every tilde statement in the `@model` macro, `m.f(vi, ...)` returns model-related information (samples, value of the model density, etc.), and adds it to `vi`. How does it do that? - - + recall that the code for `m.f(vi, ...)` is automatically generated by compilation of the `@model` macro - - + for every tilde statement in the `@model` declaration, this code contains a call to `assume(vi, ...)` if the variable on the LHS of the tilde is a **model parameter to infer**, and `observe(vi, ...)` if the variable on the LHS of the tilde is an **observation** - - + in the file corresponding to your sampling method (ie in `Turing.jl/src/inference/.jl`), you have **overloaded** `assume` and `observe`, so that they can modify `vi` to include the information and samples that you care about! - - + at a minimum, `assume` and `observe` return the log density `lp` of the sample or observation. the model evaluation function then immediately calls `acclogp!!(vi, lp)`, which adds `lp` to the value of the log joint density stored in `vi`. - -Here's what `assume` looks like for Importance Sampling: - -```{julia} -#| eval: false -function DynamicPPL.assume(rng, spl::Sampler{<:IS}, dist::Distribution, vn::VarName, vi) - r = rand(rng, dist) - push!(vi, vn, r, dist, spl) - return r, 0 -end -``` - -The function first generates a sample `r` from the distribution `dist` (the right hand side of the tilde statement). It then adds `r` to `vi`, and returns `r` and 0. - -The `observe` function is even simpler: - -```{julia} -#| eval: false -function DynamicPPL.observe(spl::Sampler{<:IS}, dist::Distribution, value, vi) - return logpdf(dist, value) -end -``` - -It simply returns the density (in the discrete case, the probability) of the observed value under the distribution `dist`. - -## 4. Summary: Importance Sampling step by step - -We focus on the AbstractMCMC functions that are overridden in `is.jl` and executed inside `mcmcsample`: `step!`, which is called `n_samples` times, and `sample_end!`, which is executed once after those `n_samples` iterations. 
- - - During the $i$-th iteration, `step!` does 3 things: - - + `empty!!(spl.state.vi)`: remove information about the previous sample from the sampler's `VarInfo` - - + `model(rng, spl.state.vi, spl)`: call the model evaluation function - - * calls to `assume` add the samples from the prior $s_i$ and $m_i$ to `spl.state.vi` - - * calls to `assume` or `observe` are followed by the line `acclogp!!(vi, lp)`, where `lp` is an output of `assume` and `observe` - - * `lp` is set to 0 after `assume`, and to the value of the density at the observation after `observe` - - * When all the tilde statements have been covered, `spl.state.vi.logp[]` is the sum of the `lp`, i.e., the likelihood $\log p(x, y \mid s_i, m_i) = \log p(x \mid s_i, m_i) + \log p(y \mid s_i, m_i)$ of the observations given the latent variable samples $s_i$ and $m_i$. - - + `return Transition(spl)`: build a transition from the sampler, and return that transition - - * the transition's `vi` field is simply `spl.state.vi` - - * the `lp` field contains the likelihood `spl.state.vi.logp[]` - - - When the `n_samples` iterations are completed, `sample_end!` fills the `final_logevidence` field of `spl.state` - - + It simply takes the logarithm of the average of the sample weights, using the log weights for numerical stability
+---
+title: How Turing Implements AbstractMCMC
+engine: julia
+aliases:
+ - ../../tutorials/docs-04-for-developers-abstractmcmc-turing/index.html
+---
+
+```{julia}
+#| echo: false
+#| output: false
+using Pkg;
+Pkg.instantiate();
+```
+
+Prerequisite: [Interface guide]({{}}).
+
+## Introduction
+
+Consider the following Turing code block:
+
+```{julia}
+using Turing
+
+@model function gdemo(x, y)
+    s² ~ InverseGamma(2, 3)
+    m ~ Normal(0, sqrt(s²))
+    x ~ Normal(m, sqrt(s²))
+    return y ~ Normal(m, sqrt(s²))
+end
+
+mod = gdemo(1.5, 2)
+alg = IS()
+n_samples = 1000
+
+chn = sample(mod, alg, n_samples, progress=false)
+```
+
+The function `sample` is part of the AbstractMCMC interface. As explained in the [interface guide]({{}}), building a sampling method that can be used by `sample` consists of overloading the structs and functions in `AbstractMCMC`. The interface guide also gives a standalone example of their implementation, [`AdvancedMH.jl`]().
+
+Turing sampling methods (most of which are written [here](https://github.com/TuringLang/Turing.jl/tree/master/src/mcmc)) also implement `AbstractMCMC`. Turing defines a particular architecture for `AbstractMCMC` implementations that enables working with models defined by the `@model` macro, and uses DynamicPPL as a backend. The goal of this page is to describe this architecture, and how you would go about implementing your own sampling method in Turing, using Importance Sampling as an example. I don't go into all the details: for instance, I don't address selectors or parallelism.
+
+First, we explain how Importance Sampling works in the abstract. Consider the model defined in the first code block. Mathematically, it can be written:
+
+$$
+\begin{align*}
+s &\sim \text{InverseGamma}(2, 3), \\
+m &\sim \text{Normal}(0, \sqrt{s}), \\
+x &\sim \text{Normal}(m, \sqrt{s}), \\
+y &\sim \text{Normal}(m, \sqrt{s}).
+\end{align*}
+$$
+
+The **latent** variables are $s$ and $m$, the **observed** variables are $x$ and $y$.
+The model **joint** distribution $p(s,m,x,y)$ decomposes into the **prior** $p(s,m)$ and the **likelihood** $p(x,y \mid s,m).$ Since $x = 1.5$ and $y = 2$ are observed, the goal is to infer the **posterior** distribution $p(s,m \mid x,y).$
+
+Importance Sampling produces independent samples $(s_i, m_i)$ from the prior distribution. It also outputs unnormalized weights
+
+$$
+w_i = \frac {p(x,y,s_i,m_i)} {p(s_i, m_i)} = p(x,y \mid s_i, m_i)
+$$
+
+such that the empirical distribution
+
+$$
+\sum_{i=1}^N \frac {w_i} {\sum_{j=1}^N w_j} \delta_{(s_i, m_i)}
+$$
+
+is a good approximation of the posterior.
+
+## 1. Define a Sampler
+
+Recall the last line of the above code block:
+
+```{julia}
+chn = sample(mod, alg, n_samples, progress=false)
+```
+
+Here `sample` takes as arguments a **model** `mod`, an **algorithm** `alg`, and a **number of samples** `n_samples`, and returns an instance `chn` of `Chains` which can be analysed using the functions in `MCMCChains`.
+
+### Models
+
+To define a **model**, you declare a joint distribution on variables in the `@model` macro, and specify which variables are observed and which should be inferred, as well as the value of the observed variables. Thus, when implementing Importance Sampling,
+
+```{julia}
+mod = gdemo(1.5, 2)
+```
+
+creates an instance `mod` of the struct `Model`, which corresponds to the observations of a value of `1.5` for `x`, and a value of `2` for `y`.
+
+This is all handled by DynamicPPL, more specifically [here](https://github.com/TuringLang/DynamicPPL.jl/blob/master/src/model.jl). I will return to how models are used to inform sampling algorithms [below](#assumeobserve).
+
+### Algorithms
+
+An **algorithm** is just a sampling method: in Turing, it is a subtype of the abstract type `InferenceAlgorithm`. Defining an algorithm may require specifying a few high-level parameters. For example, "Hamiltonian Monte Carlo" may be too vague, but "Hamiltonian Monte Carlo with 10 leapfrog steps per proposal and a stepsize of 0.01" is an algorithm. "Metropolis-Hastings" may be too vague, but "Metropolis-Hastings with proposal distribution `p`" is an algorithm.
+Thus
+
+```{julia}
+stepsize = 0.01
+L = 10
+alg = HMC(stepsize, L)
+```
+
+defines a Hamiltonian Monte Carlo algorithm, an instance of `HMC`, which is a subtype of `InferenceAlgorithm`.
+
+In the case of Importance Sampling, there is no need to specify additional parameters:
+
+```{julia}
+alg = IS()
+```
+
+defines an Importance Sampling algorithm, an instance of `IS`, a subtype of `InferenceAlgorithm`.
+
+When creating your own Turing sampling method, you must, therefore, build a subtype of `InferenceAlgorithm` corresponding to your method.
+
+### Samplers
+
+Samplers are **not** the same as algorithms. An algorithm is a generic sampling method; a sampler is an object that stores information about how algorithm and model interact during sampling, and is modified as sampling progresses.
+
+Turing implements `AbstractMCMC`'s `AbstractSampler` with the `Sampler` struct defined in `DynamicPPL`. The most important attributes of an instance `spl` of `Sampler` are:
+
+- `spl.alg`: the sampling method used, an instance of a subtype of `InferenceAlgorithm`
+- `spl.state`: information about the sampling process, see [below](#states)
+
+When you call `sample(mod, alg, n_samples)`, Turing first uses `mod` and `alg` to build an instance `spl` of `Sampler`, then calls the native `AbstractMCMC` function `sample(mod, spl, n_samples)`.
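+
+In code, this forwarding looks roughly like the following sketch. This is illustrative only — the actual method in Turing.jl also threads through RNGs, keyword arguments, and chain types (and, as shown just below, the real `Sampler` constructor takes a `Selector` too), so the exact signature differs:
+
+```{julia}
+#| eval: false
+# Illustrative sketch (not the actual Turing.jl source) of how the
+# user-facing `sample` forwards to the native AbstractMCMC method.
+function sample(model::Model, alg::InferenceAlgorithm, n_samples::Int; kwargs...)
+    # Build a `Sampler` wrapping the algorithm and a fresh state...
+    spl = Sampler(alg, model)
+    # ...then hand over to AbstractMCMC's `sample(model, spl, n_samples)`.
+    return sample(model, spl, n_samples; kwargs...)
+end
+```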
+
+When you define your own Turing sampling method, you must therefore build:
+
+- a **sampler constructor** that uses a model and an algorithm to initialize an instance of `Sampler`. For Importance Sampling:
+
+```{julia}
+#| eval: false
+function Sampler(alg::IS, model::Model, s::Selector)
+    info = Dict{Symbol,Any}()
+    state = ISState(model)
+    return Sampler(alg, info, s, state)
+end
+```
+
+- a **state** struct implementing `AbstractSamplerState` corresponding to your method: we cover this in the following paragraph.
+
+### States
+
+Sampler states are subtypes of `AbstractSamplerState`, and the state types used on this page all contain a `VarInfo` field called `vi`. The `vi` field contains all the important information about sampling: first and foremost, the values of all the samples, but also the distributions from which they are sampled, the names of model parameters, and other metadata. As we will see below, many important steps during sampling correspond to queries or updates to `spl.state.vi`.
+
+By default, you can use `SamplerState`, a concrete type defined in `inference/Inference.jl`, which extends `AbstractSamplerState` and has no field except for `vi`:
+
+```{julia}
+#| eval: false
+mutable struct SamplerState{VIType<:VarInfo} <: AbstractSamplerState
+    vi::VIType
+end
+```
+
+When doing Importance Sampling, we care not only about the values of the samples but also their weights. We will see below that the weight of each sample is also added to `spl.state.vi`. Moreover, the average
+
+$$
+\frac{1}{N} \sum_{i=1}^N w_i = \frac{1}{N} \sum_{i=1}^N p(x,y \mid s_i, m_i)
+$$
+
+of the sample weights is a particularly important quantity:
+
+- it is used to **normalize** the **empirical approximation** of the posterior distribution
+- its logarithm is the importance sampling **estimate** of the **log evidence** $\log p(x, y)$
+
+To avoid having to compute it over and over again, `is.jl` defines an IS-specific concrete type `ISState` for sampler states, with an additional field `final_logevidence` containing
+
+$$
+\log \frac{1}{N} \sum_{i=1}^N w_i.
+$$
+
+```{julia}
+#| eval: false
+mutable struct ISState{V<:VarInfo,F<:AbstractFloat} <: AbstractSamplerState
+    vi::V
+    final_logevidence::F
+end
+
+# additional constructor
+ISState(model::Model) = ISState(VarInfo(model), 0.0)
+```
+
+The following diagram summarizes the hierarchy presented above.
+
+```{dot}
+//| echo: false
+digraph G {
+ node [shape=box];
+
+ spl [label=Sampler
<:AbstractSampler>, style=rounded, xlabel="", shape=box];
+ state [label=State
<:AbstractSamplerState>, style=rounded, xlabel="", shape=box];
+ alg [label=Algorithm
<:InferenceAlgorithm>, style=rounded, xlabel="", shape=box];
+ vi [label=VarInfo
<:AbstractVarInfo>, style=rounded, xlabel="", shape=box];
+ placeholder1 [label="...", width=1];
+ placeholder2 [label="...", width=1];
+ placeholder3 [label="...", width=1];
+ placeholder4 [label="...", width=1];
+
+ spl -> state;
+ spl -> alg;
+ spl -> placeholder1;
+
+ state -> vi;
+ state -> placeholder2;
+
+ alg -> placeholder3;
+ placeholder1 -> placeholder4;
+}
+```
+
+## 2. Overload the functions used inside mcmcsample
+
+A lot of what happens here is method-specific. However, Turing also provides some functions and default implementations that make implementing these functions easier; we point out examples below.
+
+### Transitions
+
+`AbstractMCMC` stores information corresponding to each individual sample in objects called transitions, but does not specify what the structure of these objects should be. You could decide to implement a type `MyTransition` for transitions corresponding to the specifics of your methods. However, there are many situations in which the only information you need for each sample is:
+
+- its value: $\theta$
+- log of the joint probability of the observed data and this sample: `lp`
+
+`Inference.jl` [defines](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/Inference.jl#L103) a struct `Transition`, which corresponds to this default situation
+
+```{julia}
+#| eval: false
+struct Transition{T,F<:AbstractFloat}
+    θ::T
+    lp::F
+end
+```
+
+It also [contains](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/Inference.jl#L108) a constructor that builds an instance of `Transition` from an instance `spl` of `Sampler`: $\theta$ is `spl.state.vi` converted to a `NamedTuple`, and `lp` is `getlogp(spl.state.vi)`. `is.jl` uses this default constructor at the end of the `step!` function [here](https://github.com/TuringLang/Turing.jl/blob/master/src/inference/is.jl#L58).
+
+### How `sample` works
+
+A crude summary, which ignores things like parallelism, is the following:
+
+`sample` calls `mcmcsample`, which calls
+
+- `sample_init!` to set things up
+- `step!` repeatedly to produce multiple new transitions
+- `sample_end!` to perform operations once all samples have been obtained
+- `bundle_samples` to convert a vector of transitions into a more palatable type, for instance a `Chain`.
+
+You can, of course, implement all of these functions, but both `AbstractMCMC` and Turing also provide default implementations for simple cases. For instance, importance sampling uses the default implementations of `sample_init!` and `bundle_samples`, which is why you don't see code for them inside `is.jl`.
+
+## 3. Overload assume and observe
+
+The functions mentioned above, such as `sample_init!`, `step!`, etc., must, of course, use information about the model in order to generate samples! In particular, these functions may need **samples from distributions** defined in the model or to **evaluate the density of these distributions** at some values of the corresponding parameters or observations.
+
+For an example of the former, consider **Importance Sampling** as defined in `is.jl`. This implementation of Importance Sampling uses the model prior distribution as a proposal distribution, and therefore requires **samples from the prior distribution** of the model. Another example is **Approximate Bayesian Computation**, which requires multiple **samples from the model prior and likelihood distributions** in order to generate a single sample.
+
+An example of the latter is the **Metropolis-Hastings** algorithm.
+At every step of sampling from a target posterior
+
+$$
+p(\theta \mid x_{\text{obs}}),
+$$
+
+in order to compute the acceptance ratio, you need to **evaluate the model joint density**
+
+$$
+p\left(\theta_{\text{prop}}, x_{\text{obs}}\right)
+$$
+
+with $\theta_{\text{prop}}$ a sample from the proposal and $x_{\text{obs}}$ the observed data.
+
+This raises the question: how can these functions access model information during sampling? Recall that the model is stored as an instance `m` of `Model`. One of the attributes of `m` is the model evaluation function `m.f`, which is built by compiling the `@model` macro. Executing `m.f` runs the tilde statements of the model in order, and adds model information to the sampler (the instance of `Sampler` that stores information about the ongoing sampling process) at each step (see [here](https://turinglang.org/dev/docs/for-developers/compiler) for more information about how the `@model` macro is compiled). The DynamicPPL functions `assume` and `observe` determine what kind of information to add to the sampler for every tilde statement.
+
+Consider an instance `m` of `Model` and a sampler `spl`, with associated `VarInfo` `vi = spl.state.vi`. At some point during the sampling process, an AbstractMCMC function such as `step!` calls `m(vi, ...)`, which calls the model evaluation function `m.f(vi, ...)`.
+
+ - for every tilde statement in the `@model` macro, `m.f(vi, ...)` returns model-related information (samples, value of the model density, etc.), and adds it to `vi`. How does it do that?
+
+   + recall that the code for `m.f(vi, ...)` is automatically generated by compilation of the `@model` macro
+
+   + for every tilde statement in the `@model` declaration, this code contains a call to `assume(vi, ...)` if the variable on the LHS of the tilde is a **model parameter to infer**, and `observe(vi, ...)` if the variable on the LHS of the tilde is an **observation**
+
+   + in the file corresponding to your sampling method (i.e. in `Turing.jl/src/inference/.jl`), you have **overloaded** `assume` and `observe`, so that they can modify `vi` to include the information and samples that you care about!
+
+   + at a minimum, `assume` and `observe` return the log density `lp` of the sample or observation. The model evaluation function then immediately calls `acclogp!!(vi, lp)`, which adds `lp` to the value of the log joint density stored in `vi`.
+
+Here's what `assume` looks like for Importance Sampling:
+
+```{julia}
+#| eval: false
+function DynamicPPL.assume(rng, spl::Sampler{<:IS}, dist::Distribution, vn::VarName, vi)
+    r = rand(rng, dist)
+    push!(vi, vn, r, dist, spl)
+    return r, 0
+end
+```
+
+The function first generates a sample `r` from the distribution `dist` (the right hand side of the tilde statement). It then adds `r` to `vi`, and returns `r` together with a log density contribution of 0: since Importance Sampling uses the prior as its proposal distribution, the prior density cancels out of the importance weight, so sampled parameters contribute nothing to the accumulated log density.
+
+The `observe` function is even simpler:
+
+```{julia}
+#| eval: false
+function DynamicPPL.observe(spl::Sampler{<:IS}, dist::Distribution, value, vi)
+    return logpdf(dist, value)
+end
+```
+
+It simply returns the density (in the discrete case, the probability) of the observed value under the distribution `dist`.
+
+## 4. Summary: Importance Sampling step by step
+
+We focus on the AbstractMCMC functions that are overridden in `is.jl` and executed inside `mcmcsample`: `step!`, which is called `n_samples` times, and `sample_end!`, which is executed once after those `n_samples` iterations.
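+
+Before unpacking the loop bullet by bullet, here is a rough sketch of what the body of `step!` amounts to for Importance Sampling (illustrative only — the real method in `is.jl` has a longer signature and extra bookkeeping):
+
+```{julia}
+#| eval: false
+# Illustrative sketch (not the actual is.jl source) of one IS iteration.
+function AbstractMCMC.step!(rng, model, spl::Sampler{<:IS}, n_samples; kwargs...)
+    empty!!(spl.state.vi)          # forget the previous sample
+    model(rng, spl.state.vi, spl)  # run the model: `assume`/`observe` fill `vi`
+    return Transition(spl)         # package θ and lp for the user
+end
+```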
+
+ - During the $i$-th iteration, `step!` does 3 things:
+
+   + `empty!!(spl.state.vi)`: remove information about the previous sample from the sampler's `VarInfo`
+
+   + `model(rng, spl.state.vi, spl)`: call the model evaluation function
+
+     * calls to `assume` add the samples from the prior $s_i$ and $m_i$ to `spl.state.vi`
+
+     * calls to `assume` or `observe` are followed by the line `acclogp!!(vi, lp)`, where `lp` is an output of `assume` and `observe`
+
+     * `lp` is set to 0 after `assume`, and to the value of the density at the observation after `observe`
+
+     * When all the tilde statements have been covered, `spl.state.vi.logp[]` is the sum of the `lp`, i.e., the likelihood $\log p(x, y \mid s_i, m_i) = \log p(x \mid s_i, m_i) + \log p(y \mid s_i, m_i)$ of the observations given the latent variable samples $s_i$ and $m_i$.
+
+   + `return Transition(spl)`: build a transition from the sampler, and return that transition
+
+     * the transition's `θ` field is `spl.state.vi` converted to a `NamedTuple` (recall that `Transition` has no `vi` field)
+
+     * the `lp` field contains the log likelihood `spl.state.vi.logp[]`
+
+ - When the `n_samples` iterations are completed, `sample_end!` fills the `final_logevidence` field of `spl.state`
+
+   + It simply takes the logarithm of the average of the sample weights, using the log weights for numerical stability
diff --git a/developers/inference/implementing-samplers/index.qmd b/developers/inference/implementing-samplers/index.qmd
index 9d69fbb80..847c2f117 100644
--- a/developers/inference/implementing-samplers/index.qmd
+++ b/developers/inference/implementing-samplers/index.qmd
@@ -1,495 +1,495 @@
---- -title: Implementing Samplers -engine: julia -julia: - exeflags: ["--project=@.", "-t 4"] -aliases: - - ../../tutorials/docs-17-implementing-samplers/index.html --- - -```{julia} -#| echo: false -#| output: false -using Pkg; -Pkg.instantiate(); -``` - -In this tutorial, we'll go through step-by-step how to implement a "simple" sampler in [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) in such a way that it can be easily applied to Turing.jl models. - -In particular, we're going to implement a version of **Metropolis-adjusted Langevin (MALA)**. - -Note that we will implement this sampler in the [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) framework, completely "ignoring" Turing.jl until the very end of the tutorial, at which point we'll use a single line of code to make the resulting sampler available to Turing.jl. This is to really drive home the point that one can implement samplers in a way that is accessible to all of Turing.jl's users without having to use Turing.jl yourself. - - -## Quick overview of MALA - -We can view MALA as a single step of the leapfrog intergrator with resampling of momentum $p$ at every step.[^2] To make that statement a bit more concrete, we first define the *extended* target $\bar{\gamma}(x, p)$ as - -\begin{equation*} -\log \bar{\gamma}(x, p) \propto \log \gamma(x) + \log \gamma_{\mathcal{N}(0, M)}(p) -\end{equation*} - -where $\gamma_{\mathcal{N}(0, M)}$ denotes the density for a zero-centered Gaussian with covariance matrix $M$. -We then consider targeting this joint distribution over both $x$ and $p$ as follows.
-First we define the map - -\begin{equation*} -\begin{split} - L_{\epsilon}: \quad & \mathbb{R}^d \times \mathbb{R}^d \to \mathbb{R}^d \times \mathbb{R}^d \\ - & (x, p) \mapsto (\tilde{x}, \tilde{p}) := L_{\epsilon}(x, p) -\end{split} -\end{equation*} - -as - -\begin{equation*} -\begin{split} - p_{1 / 2} &:= p + \frac{\epsilon}{2} \nabla \log \gamma(x) \\ - \tilde{x} &:= x + \epsilon M^{-1} p_{1 /2 } \\ - p_1 &:= p_{1 / 2} + \frac{\epsilon}{2} \nabla \log \gamma(\tilde{x}) \\ - \tilde{p} &:= - p_1 -\end{split} -\end{equation*} - -This might be familiar for some readers as a single step of the Leapfrog integrator. -We then define the MALA kernel as follows: given the current iterate $x_i$, we sample the next iterate $x_{i + 1}$ as - -\begin{equation*} -\begin{split} - p &\sim \mathcal{N}(0, M) \\ - (\tilde{x}, \tilde{p}) &:= L_{\epsilon}(x_i, p) \\ - \alpha &:= \min \left\{ 1, \frac{\bar{\gamma}(\tilde{x}, \tilde{p})}{\bar{\gamma}(x_i, p)} \right\} \\ - x_{i + 1} &:= - \begin{cases} - \tilde{x} \quad & \text{ with prob. } \alpha \\ - x_i \quad & \text{ with prob. } 1 - \alpha - \end{cases} -\end{split} -\end{equation*} - -i.e. we accept the proposal $\tilde{x}$ with probability $\alpha$ and reject it, thus sticking with our current iterate, with probability $1 - \alpha$. - -## What we need from a model: [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) - -There are a few things we need from the "target" / "model" / density that we want to sample from: - -1. We need access to log-density *evaluations* $\log \gamma(x)$ so we can compute the acceptance ratio involving $\log \bar{\gamma}(x, p)$. -2. We need access to log-density *gradients* $\nabla \log \gamma(x)$ so we can compute the Leapfrog steps $L_{\epsilon}(x, p)$. -3. We also need access to the "size" of the model so we can determine the size of $M$. - -Luckily for us, there is a package called [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) which provides an interface for *exactly* this! - -To demonstrate how one can implement the "[LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface"[^1] we will use a simple Gaussian model as an example: - -```{julia} -using LogDensityProblems: LogDensityProblems; - -# Let's define some type that represents the model. -struct IsotropicNormalModel{M<:AbstractVector{<:Real}} - "mean of the isotropic Gaussian" - mean::M -end - -# Specifies what input length the model expects. -LogDensityProblems.dimension(model::IsotropicNormalModel) = length(model.mean) -# Implementation of the log-density evaluation of the model. -function LogDensityProblems.logdensity(model::IsotropicNormalModel, x::AbstractVector{<:Real}) - return - sum(abs2, x .- model.mean) / 2 -end -``` - -This gives us all of the properties we want for our MALA sampler with the exception of the computation of the *gradient* $\nabla \log \gamma(x)$. There is the method `LogDensityProblems.logdensity_and_gradient` which should return a 2-tuple where the first entry is the evaluation of the logdensity $\log \gamma(x)$ and the second entry is the gradient $\nabla \log \gamma(x)$. - -There are two ways to "implement" this method: 1) we implement it by hand, which is feasible in the case of our `IsotropicNormalModel`, or b) we defer the implementation of this to a automatic differentiation backend. - -To implement it by hand we can simply do - -```{julia} -# Tell LogDensityProblems.jl that first-order, i.e. gradient information, is available. 
-LogDensityProblems.capabilities(model::IsotropicNormalModel) = LogDensityProblems.LogDensityOrder{1}() - -# Implement `logdensity_and_gradient`. -function LogDensityProblems.logdensity_and_gradient(model::IsotropicNormalModel, x) - logγ_x = LogDensityProblems.logdensity(model, x) - ∇logγ_x = -x .* (x - model.mean) - return logγ_x, ∇logγ_x -end -``` - -Let's just try it out: - -```{julia} -# Instantiate the problem. -model = IsotropicNormalModel([-5., 0., 5.]) -# Create some example input that we can test on. -x_example = randn(LogDensityProblems.dimension(model)) -# Evaluate! -LogDensityProblems.logdensity(model, x_example) -``` - -To defer it to an automatic differentiation backend, we can do - -```{julia} -# Tell LogDensityProblems.jl we only have access to 0-th order information. -LogDensityProblems.capabilities(model::IsotropicNormalModel) = LogDensityProblems.LogDensityOrder{0}() - -# Use `LogDensityProblemsAD`'s `ADgradient` in combination with some AD backend to implement `logdensity_and_gradient`. -using LogDensityProblemsAD, ADTypes, ForwardDiff -model_with_grad = ADgradient(AutoForwardDiff(), model) -LogDensityProblems.logdensity(model_with_grad, x_example) -``` - -We'll continue with the second approach in this tutorial since this is typically what one does in practice, because there are better hobbies to spend time on than deriving gradients by hand. - -At this point, one might wonder how we're going to tie this back to Turing.jl in the end. Effectively, when working with inference methods that only require log-density evaluations and / or higher-order information of the log-density, Turing.jl actually converts the user-provided `Model` into an object implementing the above methods for [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl). As a result, most samplers provided by Turing.jl are actually implemented to work with [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl), enabling their use both *within* Turing.jl and *outside* of Turing.jl! Morever, there exists similar conversions for Stan through BridgeStan and Stan[LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl), which means that a sampler supporting the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface can easily be used on both Turing.jl *and* Stan models (in addition to user-provided models, as our `IsotropicNormalModel` above)! - -Anyways, let's move on to actually implementing the sampler. - -## Implementing MALA in [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) - -Now that we've established that a model implementing the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface provides us with all the information we need from $\log \gamma(x)$, we can address the question: given an object that implements the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface, how can we define a sampler for it? - -We're going to do this by making our sampler a sub-type of `AbstractMCMC.AbstractSampler` in addition to implementing a few methods from [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl). Why? Because it gets us *a lot* of functionality for free, as we will see later. - -Moreover, [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) provides a very natural interface for MCMC algorithms. 
- -First, we'll define our `MALA` type - -```{julia} -using AbstractMCMC - -struct MALA{T,A} <: AbstractMCMC.AbstractSampler - "stepsize used in the leapfrog step" - ϵ_init::T - "covariance matrix used for the momentum" - M_init::A -end -``` - -Notice how we've added the suffix `_init` to both the stepsize and the covariance matrix. We've done this because a `AbstractMCMC.AbstractSampler` should be *immutable*. Of course there might be many scenarios where we want to allow something like the stepsize and / or the covariance matrix to vary between iterations, e.g. during the burn-in / adaptation phase of the sampling process we might want to adjust the parameters using statistics computed from these initial iterations. But information which can change between iterations *should not go in the sampler itself*! Instead, this information should go in the sampler *state*. - -The sampler state should at the very least contain all the necessary information to perform the next MCMC iteration, but usually contains further information, e.g. quantities and statistics useful for evaluating whether the sampler has converged. - -We will use the following sampler state for our `MALA` sampler: - -```{julia} -struct MALAState{A<:AbstractVector{<:Real}} - "current position" - x::A -end -``` - -This might seem overly redundant: we're defining a type `MALAState` and it only contains a simple vector of reals. -In this particular case we indeed could have dropped this and simply used a `AbstractVector{<:Real}` as our sampler state, but typically, as we will see later, one wants to include other quantities in the sampler state. -For example, if we also wanted to adapt the parameters of our `MALA`, e.g. alter the stepsize depending on acceptance rates, in which case we should also put `ϵ` in the state, but for now we'll keep things simple. - -Moreover, we also want a _sample_ type, which is a type meant for "public consumption", i.e. the end-user. This is generally going to contain a subset of the information present in the state. But in such a simple scenario as this, we similarly only have a `AbstractVector{<:Real}`: - -```{julia} -struct MALASample{A<:AbstractVector{<:Real}} - "current position" - x::A -end -``` - -We currently have three things: - -1. A `AbstractMCMC.AbstractSampler` implementation called `MALA`. -2. A state `MALAState` for our sampler `MALA`. -3. A sample `MALASample` for our sampler `MALA`. - -That means that we're ready to implement the only thing that really matters: `AbstractMCMC.step`. - -`AbstractMCMC.step` defines the MCMC iteration of our `MALA` given the current `MALAState`. Specifically, the signature of the function is as follows: - -```{julia} -#| eval: false -function AbstractMCMC.step( - # The RNG to ensure reproducibility. - rng::Random.AbstractRNG, - # The model that defines our target. - model::AbstractMCMC.AbstractModel, - # The sampler for which we're taking a `step`. - sampler::AbstractMCMC.AbstractSampler, - # The current sampler `state`. - state; - # Additional keyword arguments that we may or may not need. - kwargs... -) -``` - -Moreover, there is a specific `AbstractMCMC.AbstractModel` which is used to indicate that the model that is provided implements the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface: `AbstractMCMC.LogDensityModel`. 
- -Since, as we discussed earlier, in our case we're indeed going to work with types that support the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface, we'll define `AbstractMCMC.step` for such a `AbstractMCMC.LogDensityModel`. - -Note that `AbstractMCMC.LogDensityModel` has no other purpose; it has a single field called `logdensity`, and it does nothing else. But by wrapping the model in `AbstractMCMC.LogDensityModel`, it allows samplers that want to work with [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) to define their `AbstractMCMC.step` on this type without running into method ambiguities. - -All in all, that means that the signature for our `AbstractMCMC.step` is going to be the following: - -```{julia} -#| eval: false -function AbstractMCMC.step( - rng::Random.AbstractRNG, - # `LogDensityModel` so we know we're working with LogDensityProblems.jl model. - model::AbstractMCMC.LogDensityModel, - # Our sampler. - sampler::MALA, - # Our sampler state. - state::MALAState; - kwargs... -) -``` - -Great! Now let's actually implement the full `AbstractMCMC.step` for our `MALA`. - -Let's remind ourselves what we're going to do: - -1. Sample a new momentum $p$. -2. Compute the log-density of the extended target $\log \bar{\gamma}(x, p)$. -3. Take a single leapfrog step $(\tilde{x}, \tilde{p}) = L_{\epsilon}(x, p)$. -4. Accept or reject the proposed $(\tilde{x}, \tilde{p})$. - -All in all, this results in the following: - -```{julia} -using Random: Random -using Distributions # so we get the `MvNormal` - -function AbstractMCMC.step( - rng::Random.AbstractRNG, - model_wrapper::AbstractMCMC.LogDensityModel, - sampler::MALA, - state::MALAState; - kwargs... -) - # Extract the wrapped model which implements LogDensityProblems.jl. - model = model_wrapper.logdensity - # Let's just extract the sampler parameters to make our lives easier. - ϵ = sampler.ϵ_init - M = sampler.M_init - # Extract the current parameters. - x = state.x - # Sample the momentum. - p_dist = MvNormal(zeros(LogDensityProblems.dimension(model)), M) - p = rand(rng, p_dist) - # Propose using a single leapfrog step. - x̃, p̃ = leapfrog_step(model, x, p, ϵ, M) - # Accept or reject proposal. - logp = LogDensityProblems.logdensity(model, x) + logpdf(p_dist, p) - logp̃ = LogDensityProblems.logdensity(model, x̃) + logpdf(p_dist, p̃) - logα = logp̃ - logp - state_new = if log(rand(rng)) < logα - # Accept. - MALAState(x̃) - else - # Reject. - MALAState(x) - end - # Return the "sample" and the sampler state. - return MALASample(state_new.x), state_new -end -``` - -Fairly straight-forward. - -Of course, we haven't defined the `leapfrog_step` method yet, so let's do that: - -```{julia} -function leapfrog_step(model, x, p, ϵ, M) - # Update momentum `p` using "position" `x`. - ∇logγ_x = last(LogDensityProblems.logdensity_and_gradient(model, x)) - p1 = p + (ϵ / 2) .* ∇logγ_x - # Update the "position" `x` using momentum `p1`. - x̃ = x + ϵ .* (M \ p1) - # Update momentum `p1` using position `x̃` - ∇logγ_x̃ = last(LogDensityProblems.logdensity_and_gradient(model, x̃)) - p2 = p1 + (ϵ / 2) .* ∇logγ_x̃ - # Flip momentum `p2`. - p̃ = -p2 - return x̃, p̃ -end -``` - -With all of this, we're technically ready to sample! 
- -```{julia} -using Random, LinearAlgebra - -rng = Random.default_rng() -sampler = MALA(1, I) -state = MALAState(zeros(LogDensityProblems.dimension(model))) - -x_next, state_next = AbstractMCMC.step( - rng, - AbstractMCMC.LogDensityModel(model), - sampler, - state -) -``` - -Great, it works! - -And I promised we would get quite some functionality for free if we implemented `AbstractMCMC.step`, and so we can now simply call `sample` to perform standard MCMC sampling: - -```{julia} -# Perform 1000 iterations with our `MALA` sampler. -samples = sample(model_with_grad, sampler, 10_000; initial_state=state, progress=false) -# Concatenate into a matrix. -samples_matrix = stack(sample -> sample.x, samples) -``` - -```{julia} -# Compute the marginal means and standard deviations. -hcat(mean(samples_matrix; dims=2), std(samples_matrix; dims=2)) -``` - -Let's visualize the samples - -```{julia} -using StatsPlots -plot(transpose(samples_matrix[:, 1:10:end]), alpha=0.5, legend=false) -``` - -Look at that! Things are working; amazin'. - -We can also exploit [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl)'s parallel sampling capabilities: - -```{julia} -# Run separate 4 chains for 10 000 iterations using threads to parallelize. -num_chains = 4 -samples = sample( - model_with_grad, - sampler, - MCMCThreads(), - 10_000, - num_chains; - # Note we need to provide an initial state for every chain. - initial_state=fill(state, num_chains), - progress=false -) -samples_array = stack(map(Base.Fix1(stack, sample -> sample.x), samples)) -``` - -But the fact that we have to provide the `AbstractMCMC.sample` call, etc. with an `initial_state` to get started is a bit annoying. We can avoid this by also defining a `AbstractMCMC.step` *without* the `state` argument: - -```{julia} -function AbstractMCMC.step( - rng::Random.AbstractRNG, - model_wrapper::AbstractMCMC.LogDensityModel, - ::MALA; - # NOTE: No state provided! - kwargs... -) - model = model_wrapper.logdensity - # Let's just create the initial state by sampling using a Gaussian. - x = randn(rng, LogDensityProblems.dimension(model)) - - return MALASample(x), MALAState(x) -end -``` - -Equipped with this, we no longer need to provide the `initial_state` everywhere: - -```{julia} -samples = sample(model_with_grad, sampler, 10_000; progress=false) -samples_matrix = stack(sample -> sample.x, samples) -hcat(mean(samples_matrix; dims=2), std(samples_matrix; dims=2)) -``` - -## Using our sampler with Turing.jl - -As we promised, all of this hassle of implementing our `MALA` sampler in a way that uses [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) and [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) gets us something more than *just* an "automatic" implementation of `AbstractMCMC.sample`. - -It also enables use with Turing.jl through the `externalsampler`, but we need to do one final thing first: we need to tell Turing.jl how to extract a vector of parameters from the "sample" returned in our implementation of `AbstractMCMC.step`. In our case, the "sample" is a `MALASample`, so we just need the following line: - -```{julia} -# Load Turing.jl. -using Turing - -# Overload the `getparams` method for our "sample" type, which is just a vector. -Turing.Inference.getparams(::Turing.Model, sample::MALASample) = sample.x -``` - -And with that, we're good to go! - -```{julia} -# Our previous model defined as a Turing.jl model. -@model mvnormal_model() = x ~ MvNormal([-5., 0., 5.], I) -# Instantiate our model. 
-turing_model = mvnormal_model() -# Call `sample` but now we're passing in a Turing.jl `model` and wrapping -# our `MALA` sampler in the `externalsampler` to tell Turing.jl that the sampler -# expects something that implements LogDensityProblems.jl. -chain = sample(turing_model, externalsampler(sampler), 10_000; progress=false) -``` - -Pretty neat, eh? - -### Models with constrained parameters - -One thing we've sort of glossed over in all of the above is that MALA, at least how we've implemented it, requires $x$ to live in $\mathbb{R}^d$ for some $d > 0$. If some of the parameters were in fact constrained, e.g. we were working with a `Beta` distribution which has support on the interval $(0, 1)$, *not* on $\mathbb{R}^d$, we could easily end up outside of the valid range $(0, 1)$. - -```{julia} -@model beta_model() = x ~ Beta(3, 3) -turing_model = beta_model() -chain = sample(turing_model, externalsampler(sampler), 10_000; progress=false) -``` - -Yep, that still works, but only because Turing.jl actually *transforms* the `turing_model` from constrained to unconstrained, so that the `sampler` provided to `externalsampler` is actually always working in unconstrained space! This is not always desirable, so we can turn this off: - -```{julia} -chain = sample(turing_model, externalsampler(sampler; unconstrained=false), 10_000; progress=false) -``` - -The fun thing is that this still sort of works because - -```{julia} -logpdf(Beta(3, 3), 10.0) -``` - -and so the samples that fall outside of the range are always rejected. But do notice how much worse all the diagnostics are, e.g. `ess_tail` is very poor compared to when we use `unconstrained=true`. Moreover, in more complex cases this won't just result in a "nice" `-Inf` log-density value, but instead will error: - -```{julia} -#| error: true -@model function demo() - σ² ~ truncated(Normal(), lower=0) - # If we end up with negative values for `σ²`, the `Normal` will error. - x ~ Normal(0, σ²) -end -sample(demo(), externalsampler(sampler; unconstrained=false), 10_000; progress=false) -``` - -As expected, we run into a `DomainError` at some point, while if we set `unconstrained=true`, letting Turing.jl transform the model to a unconstrained form behind the scenes, everything works as expected: - -```{julia} -sample(demo(), externalsampler(sampler; unconstrained=true), 10_000; progress=false) -``` - -Neat! - -Similarly, which automatic differentiation backend one should use can be specified through the `adtype` keyword argument too. For example, if we want to use [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) instead of the default [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl): - -```{julia} -using ReverseDiff: ReverseDiff -# Specify that we want to use `AutoReverseDiff`. -sample( - demo(), - externalsampler(sampler; unconstrained=true, adtype=AutoReverseDiff()), - 10_000; - progress=false -) -``` - -Double-neat. - -## Summary - -At this point it's worth maybe reminding ourselves what we did and also *why* we did it: - -1. We define our models in the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface because it makes the sampler agnostic to how the underlying model is implemented. -2. We implement our sampler in the [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) interface, which just means that our sampler is a subtype of `AbstractMCMC.AbstractSampler` and we implement the MCMC transition in `AbstractMCMC.step`. -3. 
Points 1 and 2 makes it so our sampler can be used with a wide range of model implementations, amongst them being models implemented in both Turing.jl and Stan. This gives you, the inference implementer, a large collection of models to test your inference method on, in addition to allowing users of Turing.jl and Stan to try out your inference method with minimal effort. - -[^1]: There is no such thing as a proper interface in Julia (at least not officially), and so we use the word "interface" here to mean a few minimal methods that needs to be implemented by any type that we treat as a target model. - -[^2]: We're going with the leapfrog formulation because in a future version of this tutorial we'll add a section extending this simple "baseline" MALA sampler to more complex versions. See [issue #479](https://github.com/TuringLang/docs/issues/479) for progress on this.
+---
+title: Implementing Samplers
+engine: julia
+julia:
+ exeflags: ["--project=@.", "-t 4"]
+aliases:
+ - ../../tutorials/docs-17-implementing-samplers/index.html
+---
+
+```{julia}
+#| echo: false
+#| output: false
+using Pkg;
+Pkg.instantiate();
+```
+
+In this tutorial, we'll go through step-by-step how to implement a "simple" sampler in [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) in such a way that it can be easily applied to Turing.jl models.
+
+In particular, we're going to implement a version of **Metropolis-adjusted Langevin (MALA)**.
+
+Note that we will implement this sampler in the [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) framework, completely "ignoring" Turing.jl until the very end of the tutorial, at which point we'll use a single line of code to make the resulting sampler available to Turing.jl. This is to really drive home the point that one can implement samplers in a way that is accessible to all of Turing.jl's users without having to use Turing.jl yourself.
+
+
+## Quick overview of MALA
+
+We can view MALA as a single step of the leapfrog integrator with resampling of momentum $p$ at every step.[^2] To make that statement a bit more concrete, we first define the *extended* target $\bar{\gamma}(x, p)$ as
+
+\begin{equation*}
+\log \bar{\gamma}(x, p) \propto \log \gamma(x) + \log \gamma_{\mathcal{N}(0, M)}(p)
+\end{equation*}
+
+where $\gamma_{\mathcal{N}(0, M)}$ denotes the density for a zero-centered Gaussian with covariance matrix $M$.
+We then consider targeting this joint distribution over both $x$ and $p$ as follows.
+First we define the map
+
+\begin{equation*}
+\begin{split}
+ L_{\epsilon}: \quad & \mathbb{R}^d \times \mathbb{R}^d \to \mathbb{R}^d \times \mathbb{R}^d \\
+ & (x, p) \mapsto (\tilde{x}, \tilde{p}) := L_{\epsilon}(x, p)
+\end{split}
+\end{equation*}
+
+as
+
+\begin{equation*}
+\begin{split}
+ p_{1/2} &:= p + \frac{\epsilon}{2} \nabla \log \gamma(x) \\
+ \tilde{x} &:= x + \epsilon M^{-1} p_{1/2} \\
+ p_1 &:= p_{1/2} + \frac{\epsilon}{2} \nabla \log \gamma(\tilde{x}) \\
+ \tilde{p} &:= - p_1
+\end{split}
+\end{equation*}
+
+This might be familiar to some readers as a single step of the Leapfrog integrator.
+We then define the MALA kernel as follows: given the current iterate $x_i$, we sample the next iterate $x_{i + 1}$ as
+
+\begin{equation*}
+\begin{split}
+ p &\sim \mathcal{N}(0, M) \\
+ (\tilde{x}, \tilde{p}) &:= L_{\epsilon}(x_i, p) \\
+ \alpha &:= \min \left\{ 1, \frac{\bar{\gamma}(\tilde{x}, \tilde{p})}{\bar{\gamma}(x_i, p)} \right\} \\
+ x_{i + 1} &:=
+ \begin{cases}
+ \tilde{x} \quad & \text{ with prob. } \alpha \\
+ x_i \quad & \text{ with prob. } 1 - \alpha
+ \end{cases}
+\end{split}
+\end{equation*}
+
+i.e. we accept the proposal $\tilde{x}$ with probability $\alpha$ and reject it, thus sticking with our current iterate, with probability $1 - \alpha$.
+
+## What we need from a model: [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl)
+
+There are a few things we need from the "target" / "model" / density that we want to sample from:
+
+1. We need access to log-density *evaluations* $\log \gamma(x)$ so we can compute the acceptance ratio involving $\log \bar{\gamma}(x, p)$.
+2. We need access to log-density *gradients* $\nabla \log \gamma(x)$ so we can compute the Leapfrog steps $L_{\epsilon}(x, p)$.
+3. We also need access to the "size" of the model so we can determine the size of $M$.
+
+Luckily for us, there is a package called [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) which provides an interface for *exactly* this!
+
+To demonstrate how one can implement the "[LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface"[^1] we will use a simple Gaussian model as an example:
+
+```{julia}
+using LogDensityProblems: LogDensityProblems;
+
+# Let's define some type that represents the model.
+struct IsotropicNormalModel{M<:AbstractVector{<:Real}}
+    "mean of the isotropic Gaussian"
+    mean::M
+end
+
+# Specifies what input length the model expects.
+LogDensityProblems.dimension(model::IsotropicNormalModel) = length(model.mean)
+# Implementation of the log-density evaluation of the model.
+function LogDensityProblems.logdensity(model::IsotropicNormalModel, x::AbstractVector{<:Real})
+    return - sum(abs2, x .- model.mean) / 2
+end
+```
+
+This gives us all of the properties we want for our MALA sampler with the exception of the computation of the *gradient* $\nabla \log \gamma(x)$. There is the method `LogDensityProblems.logdensity_and_gradient` which should return a 2-tuple where the first entry is the evaluation of the logdensity $\log \gamma(x)$ and the second entry is the gradient $\nabla \log \gamma(x)$.
+
+There are two ways to "implement" this method: (a) we implement it by hand, which is feasible in the case of our `IsotropicNormalModel`, or (b) we defer the implementation of this to an automatic differentiation backend.
+
+To implement it by hand we can simply do
+
+```{julia}
+# Tell LogDensityProblems.jl that first-order, i.e. gradient information, is available.
+LogDensityProblems.capabilities(model::IsotropicNormalModel) = LogDensityProblems.LogDensityOrder{1}()
+
+# Implement `logdensity_and_gradient`.
+function LogDensityProblems.logdensity_and_gradient(model::IsotropicNormalModel, x)
+    logγ_x = LogDensityProblems.logdensity(model, x)
+    # The gradient of -½‖x - μ‖² is -(x - μ).
+    ∇logγ_x = -(x - model.mean)
+    return logγ_x, ∇logγ_x
+end
+```
+
+Let's just try it out:
+
+```{julia}
+# Instantiate the problem.
+model = IsotropicNormalModel([-5., 0., 5.])
+# Create some example input that we can test on.
+x_example = randn(LogDensityProblems.dimension(model))
+# Evaluate!
+LogDensityProblems.logdensity(model, x_example)
+```
+
+To defer it to an automatic differentiation backend, we can do
+
+```{julia}
+# Tell LogDensityProblems.jl we only have access to 0-th order information.
+LogDensityProblems.capabilities(model::IsotropicNormalModel) = LogDensityProblems.LogDensityOrder{0}()
+
+# Use `LogDensityProblemsAD`'s `ADgradient` in combination with some AD backend to implement `logdensity_and_gradient`.
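+# (The backend is specified via an ADTypes.jl type: `AutoForwardDiff()` below, but
+# other backends such as `AutoReverseDiff()` should also work, provided the
+# corresponding AD package is loaded.)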
+using LogDensityProblemsAD, ADTypes, ForwardDiff
+model_with_grad = ADgradient(AutoForwardDiff(), model)
+LogDensityProblems.logdensity(model_with_grad, x_example)
+```
+
+We'll continue with the second approach in this tutorial since this is typically what one does in practice, because there are better hobbies to spend time on than deriving gradients by hand.
+
+At this point, one might wonder how we're going to tie this back to Turing.jl in the end. Effectively, when working with inference methods that only require log-density evaluations and/or higher-order information of the log-density, Turing.jl actually converts the user-provided `Model` into an object implementing the above methods for [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl). As a result, most samplers provided by Turing.jl are actually implemented to work with [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl), enabling their use both *within* Turing.jl and *outside* of Turing.jl! Moreover, similar conversions exist for Stan, e.g. through BridgeStan, which means that a sampler supporting the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface can easily be used on both Turing.jl *and* Stan models (in addition to user-provided models, such as our `IsotropicNormalModel` above)!
+
+Anyways, let's move on to actually implementing the sampler.
+
+## Implementing MALA in [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl)
+
+Now that we've established that a model implementing the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface provides us with all the information we need from $\log \gamma(x)$, we can address the question: given an object that implements the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface, how can we define a sampler for it?
+
+We're going to do this by making our sampler a subtype of `AbstractMCMC.AbstractSampler` in addition to implementing a few methods from [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl). Why? Because it gets us *a lot* of functionality for free, as we will see later.
+
+Moreover, [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) provides a very natural interface for MCMC algorithms.
+
+First, we'll define our `MALA` type
+
+```{julia}
+using AbstractMCMC
+
+struct MALA{T,A} <: AbstractMCMC.AbstractSampler
+    "stepsize used in the leapfrog step"
+    ϵ_init::T
+    "covariance matrix used for the momentum"
+    M_init::A
+end
+```
+
+Notice how we've added the suffix `_init` to both the stepsize and the covariance matrix. We've done this because an `AbstractMCMC.AbstractSampler` should be *immutable*. Of course there might be many scenarios where we want to allow something like the stepsize and/or the covariance matrix to vary between iterations, e.g. during the burn-in / adaptation phase of the sampling process we might want to adjust the parameters using statistics computed from these initial iterations. But information which can change between iterations *should not go in the sampler itself*! Instead, this information should go in the sampler *state*.
+
+The sampler state should at the very least contain all the necessary information to perform the next MCMC iteration, but usually contains further information, e.g. quantities and statistics useful for evaluating whether the sampler has converged.
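+
+For instance, an adaptive variant of our sampler might carry its tuning parameters in the state. The following is purely a hypothetical sketch to illustrate the pattern — `AdaptiveMALAState` is not something we implement in this tutorial:
+
+```{julia}
+#| eval: false
+# Hypothetical state for an adaptive MALA variant: the stepsize lives in the
+# state, so it can change between iterations while the sampler stays immutable.
+struct AdaptiveMALAState{A<:AbstractVector{<:Real},T<:Real}
+    x::A            # current position
+    ϵ::T            # current stepsize, updated during adaptation
+    n_accepts::Int  # number of accepted proposals, e.g. for tuning ϵ
+end
+```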
+
+We will use the following sampler state for our `MALA` sampler:
+
+```{julia}
+struct MALAState{A<:AbstractVector{<:Real}}
+    "current position"
+    x::A
+end
+```
+
+This might seem overly redundant: we're defining a type `MALAState` and it only contains a simple vector of reals.
+In this particular case we indeed could have dropped this and simply used an `AbstractVector{<:Real}` as our sampler state, but typically, as we will see later, one wants to include other quantities in the sampler state.
+For example, if we also wanted to adapt the parameters of our `MALA`, e.g. alter the stepsize depending on acceptance rates, we would also put `ϵ` in the state; but for now we'll keep things simple.
+
+Moreover, we also want a _sample_ type, which is a type meant for "public consumption", i.e. the end-user. This is generally going to contain a subset of the information present in the state. But in such a simple scenario as this, we similarly only have an `AbstractVector{<:Real}`:
+
+```{julia}
+struct MALASample{A<:AbstractVector{<:Real}}
+    "current position"
+    x::A
+end
+```
+
+We currently have three things:
+
+1. An `AbstractMCMC.AbstractSampler` implementation called `MALA`.
+2. A state `MALAState` for our sampler `MALA`.
+3. A sample `MALASample` for our sampler `MALA`.
+
+That means that we're ready to implement the only thing that really matters: `AbstractMCMC.step`.
+
+`AbstractMCMC.step` defines the MCMC iteration of our `MALA` given the current `MALAState`. Specifically, the signature of the function is as follows:
+
+```{julia}
+#| eval: false
+function AbstractMCMC.step(
+    # The RNG to ensure reproducibility.
+    rng::Random.AbstractRNG,
+    # The model that defines our target.
+    model::AbstractMCMC.AbstractModel,
+    # The sampler for which we're taking a `step`.
+    sampler::AbstractMCMC.AbstractSampler,
+    # The current sampler `state`.
+    state;
+    # Additional keyword arguments that we may or may not need.
+    kwargs...
+)
+```
+
+Moreover, there is a specific `AbstractMCMC.AbstractModel` which is used to indicate that the model that is provided implements the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface: `AbstractMCMC.LogDensityModel`.
+
+Since, as we discussed earlier, in our case we're indeed going to work with types that support the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface, we'll define `AbstractMCMC.step` for such an `AbstractMCMC.LogDensityModel`.
+
+Note that `AbstractMCMC.LogDensityModel` has no other purpose; it has a single field called `logdensity`, and it does nothing else. But by wrapping the model in `AbstractMCMC.LogDensityModel`, it allows samplers that want to work with [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) to define their `AbstractMCMC.step` on this type without running into method ambiguities.
+
+All in all, that means that the signature for our `AbstractMCMC.step` is going to be the following:
+
+```{julia}
+#| eval: false
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG,
+    # `LogDensityModel` so we know we're working with a LogDensityProblems.jl model.
+    model::AbstractMCMC.LogDensityModel,
+    # Our sampler.
+    sampler::MALA,
+    # Our sampler state.
+    state::MALAState;
+    kwargs...
+)
+```
+
+Great! Now let's actually implement the full `AbstractMCMC.step` for our `MALA`.
+
+Let's remind ourselves what we're going to do:
+
+1. Sample a new momentum $p$.
+2. Compute the log-density of the extended target $\log \bar{\gamma}(x, p)$.
+3. Take a single leapfrog step $(\tilde{x}, \tilde{p}) = L_{\epsilon}(x, p)$.
+4. Accept or reject the proposed $(\tilde{x}, \tilde{p})$.
+
+All in all, this results in the following:
+
+```{julia}
+using Random: Random
+using Distributions # so we get the `MvNormal`
+
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG,
+    model_wrapper::AbstractMCMC.LogDensityModel,
+    sampler::MALA,
+    state::MALAState;
+    kwargs...
+)
+    # Extract the wrapped model which implements LogDensityProblems.jl.
+    model = model_wrapper.logdensity
+    # Let's just extract the sampler parameters to make our lives easier.
+    ϵ = sampler.ϵ_init
+    M = sampler.M_init
+    # Extract the current parameters.
+    x = state.x
+    # Sample the momentum.
+    p_dist = MvNormal(zeros(LogDensityProblems.dimension(model)), M)
+    p = rand(rng, p_dist)
+    # Propose using a single leapfrog step.
+    x̃, p̃ = leapfrog_step(model, x, p, ϵ, M)
+    # Accept or reject proposal.
+    logp = LogDensityProblems.logdensity(model, x) + logpdf(p_dist, p)
+    logp̃ = LogDensityProblems.logdensity(model, x̃) + logpdf(p_dist, p̃)
+    logα = logp̃ - logp
+    state_new = if log(rand(rng)) < logα
+        # Accept.
+        MALAState(x̃)
+    else
+        # Reject.
+        MALAState(x)
+    end
+    # Return the "sample" and the sampler state.
+    return MALASample(state_new.x), state_new
+end
+```
+
+Fairly straightforward.
+
+Of course, we haven't defined the `leapfrog_step` method yet, so let's do that:
+
+```{julia}
+function leapfrog_step(model, x, p, ϵ, M)
+    # Update momentum `p` using "position" `x`.
+    ∇logγ_x = last(LogDensityProblems.logdensity_and_gradient(model, x))
+    p1 = p + (ϵ / 2) .* ∇logγ_x
+    # Update the "position" `x` using momentum `p1`.
+    x̃ = x + ϵ .* (M \ p1)
+    # Update momentum `p1` using position `x̃`.
+    ∇logγ_x̃ = last(LogDensityProblems.logdensity_and_gradient(model, x̃))
+    p2 = p1 + (ϵ / 2) .* ∇logγ_x̃
+    # Flip momentum `p2`.
+    p̃ = -p2
+    return x̃, p̃
+end
+```
+
+With all of this, we're technically ready to sample!
+
+```{julia}
+using Random, LinearAlgebra
+
+rng = Random.default_rng()
+sampler = MALA(1, I)
+state = MALAState(zeros(LogDensityProblems.dimension(model)))
+
+x_next, state_next = AbstractMCMC.step(
+    rng,
+    AbstractMCMC.LogDensityModel(model),
+    sampler,
+    state
+)
+```
+
+Great, it works!
+
+And I promised we would get quite some functionality for free if we implemented `AbstractMCMC.step`, and so we can now simply call `sample` to perform standard MCMC sampling:
+
+```{julia}
+# Perform 10_000 iterations with our `MALA` sampler.
+samples = sample(model_with_grad, sampler, 10_000; initial_state=state, progress=false)
+# Concatenate into a matrix.
+samples_matrix = stack(sample -> sample.x, samples)
+```
+
+```{julia}
+# Compute the marginal means and standard deviations.
+hcat(mean(samples_matrix; dims=2), std(samples_matrix; dims=2))
+```
+
+Let's visualize the samples:
+
+```{julia}
+using StatsPlots
+plot(transpose(samples_matrix[:, 1:10:end]), alpha=0.5, legend=false)
+```
+
+Look at that! Things are working; amazin'.
+
+We can also exploit [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl)'s parallel sampling capabilities:
+
+```{julia}
+# Run 4 separate chains for 10_000 iterations using threads to parallelize.
+num_chains = 4
+samples = sample(
+    model_with_grad,
+    sampler,
+    MCMCThreads(),
+    10_000,
+    num_chains;
+    # Note we need to provide an initial state for every chain.
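+    # (Here we simply replicate the same initial state for every chain; one
+    # could instead draw a distinct starting point per chain.)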
+    initial_state=fill(state, num_chains),
+    progress=false
+)
+samples_array = stack(map(Base.Fix1(stack, sample -> sample.x), samples))
+```
+
+But having to provide the `AbstractMCMC.sample` call, etc. with an `initial_state` to get started is a bit annoying. We can avoid this by also defining an `AbstractMCMC.step` *without* the `state` argument:
+
+```{julia}
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG,
+    model_wrapper::AbstractMCMC.LogDensityModel,
+    ::MALA;
+    # NOTE: No state provided!
+    kwargs...
+)
+    model = model_wrapper.logdensity
+    # Let's just create the initial state by sampling using a Gaussian.
+    x = randn(rng, LogDensityProblems.dimension(model))
+
+    return MALASample(x), MALAState(x)
+end
+```
+
+Equipped with this, we no longer need to provide the `initial_state` everywhere:
+
+```{julia}
+samples = sample(model_with_grad, sampler, 10_000; progress=false)
+samples_matrix = stack(sample -> sample.x, samples)
+hcat(mean(samples_matrix; dims=2), std(samples_matrix; dims=2))
+```
+
+## Using our sampler with Turing.jl
+
+As we promised, all of this hassle of implementing our `MALA` sampler in a way that uses [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) and [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) gets us something more than *just* an "automatic" implementation of `AbstractMCMC.sample`.
+
+It also enables use with Turing.jl through `externalsampler`, but we need to do one final thing first: we need to tell Turing.jl how to extract a vector of parameters from the "sample" returned in our implementation of `AbstractMCMC.step`. In our case, the "sample" is a `MALASample`, so we just need the following line:
+
+```{julia}
+# Load Turing.jl.
+using Turing
+
+# Overload the `getparams` method for our "sample" type, which is just a vector.
+Turing.Inference.getparams(::Turing.Model, sample::MALASample) = sample.x
+```
+
+And with that, we're good to go!
+
+```{julia}
+# Our previous model defined as a Turing.jl model.
+@model mvnormal_model() = x ~ MvNormal([-5., 0., 5.], I)
+# Instantiate our model.
+turing_model = mvnormal_model()
+# Call `sample` but now we're passing in a Turing.jl `model` and wrapping
+# our `MALA` sampler in the `externalsampler` to tell Turing.jl that the sampler
+# expects something that implements LogDensityProblems.jl.
+chain = sample(turing_model, externalsampler(sampler), 10_000; progress=false)
+```
+
+Pretty neat, eh?
+
+### Models with constrained parameters
+
+One thing we've sort of glossed over in all of the above is that MALA, at least as we've implemented it, requires $x$ to live in $\mathbb{R}^d$ for some $d > 0$. If some of the parameters were in fact constrained, e.g. we were working with a `Beta` distribution which has support on the interval $(0, 1)$, *not* on $\mathbb{R}^d$, we could easily end up outside of the valid range $(0, 1)$.
+
+```{julia}
+@model beta_model() = x ~ Beta(3, 3)
+turing_model = beta_model()
+chain = sample(turing_model, externalsampler(sampler), 10_000; progress=false)
+```
+
+Yep, that still works, but only because Turing.jl actually *transforms* the `turing_model` from constrained to unconstrained, so that the `sampler` provided to `externalsampler` is actually always working in unconstrained space!
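+
+To get a feel for what this transformation involves, here is a minimal sketch of the idea (Turing.jl handles these transformations via Bijectors.jl under the hood; the helper names below are purely illustrative and *not* Turing.jl's internals): for a parameter with support $(0, 1)$, one can work in logit-space and map back, adding a Jacobian correction so that the transformed log-density is a valid log-density on all of $\mathbb{R}$.
+
+```{julia}
+# Map between (0, 1) and ℝ; purely illustrative helpers.
+mylogit(x) = log(x / (1 - x))      # (0, 1) → ℝ
+myinvlogit(y) = 1 / (1 + exp(-y))  # ℝ → (0, 1)
+
+# Log-density of Beta(3, 3) in unconstrained y = mylogit(x) space:
+# log q(y) = log p(myinvlogit(y)) + log |d myinvlogit(y) / dy|,
+# where d myinvlogit(y) / dy = myinvlogit(y) * (1 - myinvlogit(y)).
+function logdensity_unconstrained(y)
+    x = myinvlogit(y)
+    return logpdf(Beta(3, 3), x) + log(x) + log(1 - x)
+end
+
+logdensity_unconstrained(10.0)  # finite for *any* real input
+```
+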
+This transformation is not always desirable, so we can turn it off:
+
+```{julia}
+chain = sample(turing_model, externalsampler(sampler; unconstrained=false), 10_000; progress=false)
+```
+
+The fun thing is that this still sort of works, because the log-density outside of the support evaluates to `-Inf`:
+
+```{julia}
+logpdf(Beta(3, 3), 10.0)
+```
+
+and so the samples that fall outside of the range are always rejected. But do notice how much worse all the diagnostics are, e.g. `ess_tail` is very poor compared to when we use `unconstrained=true`. Moreover, in more complex cases this won't just result in a "nice" `-Inf` log-density value, but instead will error:
+
+```{julia}
+#| error: true
+@model function demo()
+    σ² ~ truncated(Normal(), lower=0)
+    # If we end up with negative values for `σ²`, the `Normal` will error.
+    x ~ Normal(0, σ²)
+end
+sample(demo(), externalsampler(sampler; unconstrained=false), 10_000; progress=false)
+```
+
+As expected, we run into a `DomainError` at some point, while if we set `unconstrained=true`, letting Turing.jl transform the model to an unconstrained form behind the scenes, everything works as expected:
+
+```{julia}
+sample(demo(), externalsampler(sampler; unconstrained=true), 10_000; progress=false)
+```
+
+Neat!
+
+Similarly, the automatic differentiation backend to use can be specified through the `adtype` keyword argument. For example, if we want to use [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) instead of the default [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl):
+
+```{julia}
+using ReverseDiff: ReverseDiff
+# Specify that we want to use `AutoReverseDiff`.
+sample(
+    demo(),
+    externalsampler(sampler; unconstrained=true, adtype=AutoReverseDiff()),
+    10_000;
+    progress=false
+)
+```
+
+Double-neat.
+
+## Summary
+
+At this point it's worth reminding ourselves of what we did and also *why* we did it:
+
+1. We define our models in the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface because it makes the sampler agnostic to how the underlying model is implemented.
+2. We implement our sampler in the [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) interface, which just means that our sampler is a subtype of `AbstractMCMC.AbstractSampler` and we implement the MCMC transition in `AbstractMCMC.step`.
+3. Points 1 and 2 make it possible to use our sampler with a wide range of model implementations, among them models implemented in both Turing.jl and Stan. This gives you, the inference implementer, a large collection of models to test your inference method on, in addition to allowing users of Turing.jl and Stan to try out your inference method with minimal effort.
+
+[^1]: There is no such thing as a proper interface in Julia (at least not officially), and so we use the word "interface" here to mean a few minimal methods that need to be implemented by any type that we treat as a target model.
+
+[^2]: We're going with the leapfrog formulation because in a future version of this tutorial we'll add a section extending this simple "baseline" MALA sampler to more complex versions. See [issue #479](https://github.com/TuringLang/docs/issues/479) for progress on this.
diff --git a/developers/inference/variational-inference/index.qmd b/developers/inference/variational-inference/index.qmd index 0965b07c7..d424b5ecc 100755 --- a/developers/inference/variational-inference/index.qmd +++ b/developers/inference/variational-inference/index.qmd @@ -1,385 +1,385 @@ ---- -title: Variational Inference -engine: julia -aliases: - - ../../tutorials/docs-07-for-developers-variational-inference/index.html ---- - -# Overview - -In this post, we'll examine variational inference (VI), a family of approximate Bayesian inference methods. We will focus on one of the more standard VI methods, Automatic Differentiation Variational Inference (ADVI). - -Here, we'll examine the theory behind VI, but if you're interested in using ADVI in Turing, [check out this tutorial]({{}}). - -# Motivation - -In Bayesian inference, one usually specifies a model as follows: given data $\\{x_i\\}_{i = 1}^n$, - -::: {.column-page} -$$ -\begin{align*} - \text{prior:} \quad z &\sim p(z) \\ - \text{likelihood:} \quad x_i &\overset{\text{i.i.d.}}{\sim} p(x \mid z) \quad \text{where} \quad i = 1, \dots, n -\end{align*} -$$ -::: - -where $\overset{\text{i.i.d.}}{\sim}$ denotes that the samples are identically independently distributed. Our goal in Bayesian inference is then to find the _posterior_ - -::: {.column-page} -$$ -p(z \mid \\{ x\_i \\}\_{i = 1}^n) \propto p(z) \prod\_{i=1}^{n} p(x\_i \mid z). -$$ -::: - -In general, one cannot obtain a closed form expression for $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$, but one might still be able to _sample_ from $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ with guarantees of converging to the target posterior $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ as the number of samples go to $\infty$, e.g. MCMC. - -As you are hopefully already aware, Turing.jl provides many methods with asymptotic exactness guarantees that we can apply to such a problem! - -Unfortunately, these unbiased samplers can be prohibitively expensive to run. As the model $p$ increases in complexity, the convergence of these unbiased samplers can slow down dramatically. Still, in the _infinite_ limit, these methods should converge to the true posterior! But infinity is fairly large, like, _at least_ more than 12, so this might take a while. - -In such a case, it might be desirable to sacrifice some of these asymptotic guarantees and instead _approximate_ the posterior $p(z \mid \\{ x_i \\}_{i = 1}^n)$ using some other model which we'll denote $q(z)$. - -There are multiple approaches to take in this case, one of which is **variational inference (VI)**. - -# Variational Inference (VI) - -In VI, we're looking to approximate $p(z \mid \\{ x_i \\}_{i = 1}^n )$ using some _approximate_ or _variational_ posterior $q(z)$. - -To approximate something you need a notion of what "close" means. In the context of probability densities a standard such "measure" of closeness is the _Kullback-Leibler (KL) divergence_ , though this is far from the only one. The KL-divergence is defined between two densities $q(z)$ and $p(z \mid \\{ x_i \\}_{i = 1}^n)$ as - -::: {.column-page} -$$ -\begin{align*} - \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) &= \int \log \left( \frac{q(z)}{\prod\_{i = 1}^n p(z \mid x\_i)} \right) q(z) \mathrm{d}{z} \\\\ - &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) - \sum\_{i = 1}^n \log p(z \mid x\_i) \right] \\\\ - &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(z \mid x\_i) \right]. 
-\end{align*} -$$ -::: - -It's worth noting that unfortunately the KL-divergence is _not_ a metric/distance in the analysis-sense due to its lack of symmetry. On the other hand, it turns out that minimizing the KL-divergence that it's actually equivalent to maximizing the log-likelihood! Also, under reasonable restrictions on the densities at hand, - -::: {.column-page} -$$ -\mathrm{D\_{KL}}\left(q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) = 0 \quad \iff \quad q(z) = p(z \mid \\{ x\_i \\}\_{i = 1}^n), \quad \forall z. -$$ -::: - -Therefore one could (and we will) attempt to approximate $p(z \mid \\{ x_i \\}_{i = 1}^n)$ using a density $q(z)$ by minimizing the KL-divergence between these two! - -One can also show that $\mathrm{D_{KL}} \ge 0$, which we'll need later. Finally notice that the KL-divergence is only well-defined when in fact $q(z)$ is zero everywhere $p(z \mid \\{ x_i \\}_{i = 1}^n)$ is zero, i.e. - -::: {.column-page} -$$ -\mathrm{supp}\left(q(z)\right) \subseteq \mathrm{supp}\left(p(z \mid x)\right). -$$ -::: - -Otherwise, there might be a point $z_0 \sim q(z)$ such that $p(z_0 \mid \\{ x_i \\}_{i = 1}^n) = 0$, resulting in $\log\left(\frac{q(z)}{0}\right)$ which doesn't make sense! - -One major problem: as we can see in the definition of the KL-divergence, we need $p(z \mid \\{ x_i \\}_{i = 1}^n)$ for any $z$ if we want to compute the KL-divergence between this and $q(z)$. We don't have that. The entire reason we even do Bayesian inference is that we don't know the posterior! Cleary this isn't going to work. _Or is it?!_ - -## Computing KL-divergence without knowing the posterior - -First off, recall that - -::: {.column-page} -$$ -p(z \mid x\_i) = \frac{p(x\_i, z)}{p(x\_i)} -$$ -::: - -so we can write - -::: {.column-page} -$$ -\begin{align*} -\mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) - \log p(x\_i) \right] \\ - &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] + \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x_i) \right] \\ - &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] + \sum\_{i = 1}^n \log p(x\_i), -\end{align*} -$$ -::: - -where in the last equality we used the fact that $p(x_i)$ is independent of $z$. - -Now you're probably thinking "Oh great! Now you've introduced $p(x_i)$ which we _also_ can't compute (in general)!". Woah. Calm down human. Let's do some more algebra. The above expression can be rearranged to - -::: {.column-page} -$$ -\mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) + \underbrace{\sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] - \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right]}\_{=: \mathrm{ELBO}(q)} = \underbrace{\sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i) \right]}\_{\text{constant}}. -$$ -::: - -See? The left-hand side is _constant_ and, as we mentioned before, $\mathrm{D_{KL}} \ge 0$. What happens if we try to _maximize_ the term we just gave the completely arbitrary name $\mathrm{ELBO}$? Well, if $\mathrm{ELBO}$ goes up while $p(x_i)$ stays constant then $\mathrm{D_{KL}}$ _has to_ go down! 
That is, the $q(z)$ which _minimizes_ the KL-divergence is the same $q(z)$ which _maximizes_ $\mathrm{ELBO}(q)$: - -::: {.column-page} -$$ -\underset{q}{\mathrm{argmin}} \ \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) = \underset{q}{\mathrm{argmax}} \ \mathrm{ELBO}(q) -$$ -::: - -where - -::: {.column-page} -$$ -\begin{align*} -\mathrm{ELBO}(q) &:= \left( \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] \right) - \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] \\ - &= \left( \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] \right) + \mathbb{H}\left( q(z) \right) -\end{align*} -$$ -::: - -and $\mathbb{H} \left(q(z) \right)$ denotes the [(differential) entropy](https://www.wikiwand.com/en/Differential_entropy) of $q(z)$. - -Assuming joint $p(x_i, z)$ and the entropy $\mathbb{H}\left(q(z)\right)$ are both tractable, we can use a Monte-Carlo for the remaining expectation. This leaves us with the following tractable expression - -::: {.column-page} -$$ -\underset{q}{\mathrm{argmin}} \ \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) \approx \underset{q}{\mathrm{argmax}} \ \widehat{\mathrm{ELBO}}(q) -$$ -::: - -where - -::: {.column-page} -$$ -\widehat{\mathrm{ELBO}}(q) = \frac{1}{m} \left( \sum\_{k = 1}^m \sum\_{i = 1}^n \log p(x\_i, z\_k) \right) + \mathbb{H} \left(q(z)\right) \quad \text{where} \quad z\_k \sim q(z) \quad \forall k = 1, \dots, m. -$$ -::: - -Hence, as long as we can sample from $q(z)$ somewhat efficiently, we can indeed minimize the KL-divergence! Neat, eh? - -Sidenote: in the case where $q(z)$ is tractable but $\mathbb{H} \left(q(z) \right)$ is _not_ , we can use an Monte-Carlo estimate for this term too but this generally results in a higher-variance estimate. - -Also, I fooled you real good: the ELBO _isn't_ an arbitrary name, hah! In fact it's an abbreviation for the **expected lower bound (ELBO)** because it, uhmm, well, it's the _expected_ lower bound (remember $\mathrm{D_{KL}} \ge 0$). Yup. - -## Maximizing the ELBO - -Finding the optimal $q$ over _all_ possible densities of course isn't feasible. Instead we consider a family of _parameterized_ densities $\mathscr{D}\_{\Theta}$ where $\Theta$ denotes the space of possible parameters. Each density in this family $q\_{\theta} \in \mathscr{D}\_{\Theta}$ is parameterized by a unique $\theta \in \Theta$. Moreover, we'll assume - - 1. $q\_{\theta}(z)$, i.e. evaluating the probability density $q$ at any point $z$, is differentiable - 2. $z \sim q\_{\theta}(z)$, i.e. the process of sampling from $q\_{\theta}(z)$, is differentiable - -(1) is fairly straight-forward, but (2) is a bit tricky. What does it even mean for a _sampling process_ to be differentiable? This is quite an interesting problem in its own right and would require something like a [50-page paper to properly review the different approaches (highly recommended read)](https://arxiv.org/abs/1906.10652). - -We're going to make use of a particular such approach which goes under a bunch of different names: _reparametrization trick_, _path derivative_, etc. This refers to making the assumption that all elements $q\_{\theta} \in \mathscr{Q}\_{\Theta}$ can be considered as reparameterizations of some base density, say $\bar{q}(z)$. 
That is, if $q\_{\theta} \in \mathscr{Q}\_{\Theta}$ then - -::: {.column-page} -$$ -z \sim q\_{\theta}(z) \quad \iff \quad z := g\_{\theta}(\tilde{z}) \quad \text{where} \quad \bar{z} \sim \bar{q}(z) -$$ -::: - -for some function $g\_{\theta}$ differentiable wrt. $\theta$. So all $q_{\theta} \in \mathscr{Q}\_{\Theta}$ are using the *same* reparameterization-function $g$ but each $q\_{\theta}$ correspond to different choices of $\theta$ for $f\_{\theta}$. - -Under this assumption we can differentiate the sampling process by taking the derivative of $g\_{\theta}$ wrt. $\theta$, and thus we can differentiate the entire $\widehat{\mathrm{ELBO}}(q\_{\theta})$ wrt. $\theta$! With the gradient available we can either try to solve for optimality either by setting the gradient equal to zero or maximize $\widehat{\mathrm{ELBO}}(q\_{\theta})$ stepwise by traversing $\mathscr{Q}\_{\Theta}$ in the direction of steepest ascent. For the sake of generality, we're going to go with the stepwise approach. - -With all this nailed down, we eventually reach the section on **Automatic Differentiation Variational Inference (ADVI)**. - -## Automatic Differentiation Variational Inference (ADVI) - -So let's revisit the assumptions we've made at this point: - - 1. The variational posterior $q\_{\theta}$ is in a parameterized family of densities denoted $\mathscr{Q}\_{\Theta}$, with $\theta \in \Theta$. - - 2. $\mathscr{Q}\_{\Theta}$ is a space of _reparameterizable_ densities with $\bar{q}(z)$ as the base-density. - - 3. The parameterization function $g\_{\theta}$ is differentiable wrt. $\theta$. - - 4. Evaluation of the probability density $q\_{\theta}(z)$ is differentiable wrt. $\theta$. - - 5. $\mathbb{H}\left(q\_{\theta}(z)\right)$ is tractable. - - 6. Evaluation of the joint density $p(x, z)$ is tractable and differentiable wrt. $z$ - - 7. The support of $q(z)$ is a subspace of the support of $p(z \mid x)$ : $\mathrm{supp}\left(q(z)\right) \subseteq \mathrm{supp}\left(p(z \mid x)\right)$. - -All of these are not *necessary* to do VI, but they are very convenient and results in a fairly flexible approach. One distribution which has a density satisfying all of the above assumptions _except_ (7) (we'll get back to this in second) for any tractable and differentiable $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ is the good ole' Gaussian/normal distribution: - -::: {.column-page} -$$ -z \sim \mathcal{N}(\mu, \Sigma) \quad \iff \quad z = g\_{\mu, L}(\bar{z}) := \mu + L^T \tilde{z} \quad \text{where} \quad \bar{z} \sim \bar{q}(z) := \mathcal{N}(1\_d, I\_{d \times d}) -$$ -::: - -where $\Sigma = L L^T,$ with $L$ obtained from the Cholesky-decomposition. Abusing notation a bit, we're going to write - -::: {.column-page} -$$ -\theta = (\mu, \Sigma) := (\mu\_1, \dots, \mu\_d, L\_{11}, \dots, L\_{1, d}, L\_{2, 1}, \dots, L\_{2, d}, \dots, L\_{d, 1}, \dots, L\_{d, d}). -$$ -::: - -With this assumption we finally have a tractable expression for $\widehat{\mathrm{ELBO}}(q_{\mu, \Sigma})$! Well, assuming (7) is holds. Since a Gaussian has non-zero probability on the entirety of $\mathbb{R}^d$, we also require $p(z \mid \\{ x_i \\}_{i = 1}^n)$ to have non-zero probability on all of $\mathbb{R}^d$. - -Though not necessary, we'll often make a *mean-field* assumption for the variational posterior $q(z)$, i.e. assume independence between the latent variables. In this case, we'll write - -::: {.column-page} -$$ -\theta = (\mu, \sigma^2) := (\mu\_1, \dots, \mu\_d, \sigma\_1^2, \dots, \sigma\_d^2). 
-$$ -::: - -### Examples - -As a (trivial) example we could apply the approach described above to is the following generative model for $p(z \mid \\{ x_i \\}\_{i = 1}^n)$: - -::: {.column-page} -$$ -\begin{align*} - m &\sim \mathcal{N}(0, 1) \\ - x\_i &\overset{\text{i.i.d.}}{=} \mathcal{N}(m, 1), \quad i = 1, \dots, n. -\end{align*} -$$ -::: - -In this case $z = m$ and we have the posterior defined $p(m \mid \\{ x\_i \\}\_{i = 1}^n) = p(m) \prod\_{i = 1}^n p(x\_i \mid m)$. Then the variational posterior would be - -::: {.column-page} -$$ -q\_{\mu, \sigma} = \mathcal{N}(\mu, \sigma^2), \quad \text{where} \quad \mu \in \mathbb{R}, \ \sigma^2 \in \mathbb{R}^{ + }. -$$ -::: - -And since prior of $m$, $\mathcal{N}(0, 1)$, has non-zero probability on the entirety of $\mathbb{R}$, same as $q(m)$, i.e. assumption (7) above holds, everything is fine and life is good. - -But what about this generative model for $p(z \mid \\{ x_i \\}_{i = 1}^n)$: - -::: {.column-page} -$$ -\begin{align*} - s &\sim \mathrm{InverseGamma}(2, 3), \\ - m &\sim \mathcal{N}(0, s), \\ - x\_i &\overset{\text{i.i.d.}}{=} \mathcal{N}(m, s), \quad i = 1, \dots, n, -\end{align*} -$$ -::: - -with posterior $p(s, m \mid \\{ x\_i \\}\_{i = 1}^n) = p(s) p(m \mid s) \prod\_{i = 1}^n p(x\_i \mid s, m)$ and the mean-field variational posterior $q(s, m)$ will be - -::: {.column-page} -$$ -q\_{\mu\_1, \mu\_2, \sigma\_1^2, \sigma\_2^2}(s, m) = p\_{\mathcal{N}(\mu\_1, \sigma\_1^2)}(s)\ p\_{\mathcal{N}(\mu\_2, \sigma\_2^2)}(m), -$$ -::: - -where we've denoted the evaluation of the probability density of a Gaussian as $p_{\mathcal{N}(\mu, \sigma^2)}(x)$. - -Observe that $\mathrm{InverseGamma}(2, 3)$ has non-zero probability only on $\mathbb{R}^{ + } := (0, \infty)$ which is clearly not all of $\mathbb{R}$ like $q(s, m)$ has, i.e. - -::: {.column-page} -$$ -\mathrm{supp} \left( q(s, m) \right) \not\subseteq \mathrm{supp} \left( p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right). -$$ -::: - -Recall from the definition of the KL-divergence that when this is the case, the KL-divergence isn't well defined. This gets us to the *automatic* part of ADVI. - -### "Automatic"? How? - -For a lot of the standard (continuous) densities $p$ we can actually construct a probability density $\tilde{p}$ with non-zero probability on all of $\mathbb{R}$ by *transforming* the "constrained" probability density $p$ to $\tilde{p}$. In fact, in these cases this is a one-to-one relationship. As we'll see, this helps solve the support-issue we've been going on and on about. - -#### Transforming densities using change of variables - -If we want to compute the probability of $x$ taking a value in some set $A \subseteq \mathrm{supp} \left( p(x) \right)$, we have to integrate $p(x)$ over $A$, i.e. - -::: {.column-page} -$$ -\mathbb{P}_p(x \in A) = \int_A p(x) \mathrm{d}x. -$$ -::: - -This means that if we have a differentiable bijection $f: \mathrm{supp} \left( q(x) \right) \to \mathbb{R}^d$ with differentiable inverse $f^{-1}: \mathbb{R}^d \to \mathrm{supp} \left( p(x) \right)$, we can perform a change of variables - -::: {.column-page} -$$ -\mathbb{P}\_p(x \in A) = \int\_{f^{-1}(A)} p \left(f^{-1}(y) \right) \ \left| \det \mathcal{J}\_{f^{-1}}(y) \right| \mathrm{d}y, -$$ -::: - -where $\mathcal{J}_{f^{-1}}(x)$ denotes the jacobian of $f^{-1}$ evaluated at $x$. 
Observe that this defines a probability distribution - -::: {.column-page} -$$ -\mathbb{P}\_{\tilde{p}}\left(y \in f^{-1}(A) \right) = \int\_{f^{-1}(A)} \tilde{p}(y) \mathrm{d}y, -$$ -::: - -since $f^{-1}\left(\mathrm{supp} (p(x)) \right) = \mathbb{R}^d$ which has probability 1. This probability distribution has *density* $\tilde{p}(y)$ with $\mathrm{supp} \left( \tilde{p}(y) \right) = \mathbb{R}^d$, defined - -::: {.column-page} -$$ -\tilde{p}(y) = p \left( f^{-1}(y) \right) \ \left| \det \mathcal{J}\_{f^{-1}}(y) \right| -$$ -::: - -or equivalently - -::: {.column-page} -$$ -\tilde{p} \left( f(x) \right) = \frac{p(x)}{\big| \det \mathcal{J}\_{f}(x) \big|} -$$ -::: - -due to the fact that - -::: {.column-page} -$$ -\big| \det \mathcal{J}\_{f^{-1}}(y) \big| = \big| \det \mathcal{J}\_{f}(x) \big|^{-1} -$$ -::: - -*Note: it's also necessary that the log-abs-det-jacobian term is non-vanishing. This can for example be accomplished by assuming $f$ to also be elementwise monotonic.* - -#### Back to VI - -So why is this is useful? Well, we're looking to generalize our approach using a normal distribution to cases where the supports don't match up. How about defining $q(z)$ by - -::: {.column-page} -$$ -\begin{align*} - \eta &\sim \mathcal{N}(\mu, \Sigma), \\\\ - z &= f^{-1}(\eta), -\end{align*} -$$ -::: - -where $f^{-1}: \mathbb{R}^d \to \mathrm{supp} \left( p(z \mid x) \right)$ is a differentiable bijection with differentiable inverse. Then $z \sim q_{\mu, \Sigma}(z) \implies z \in \mathrm{supp} \left( p(z \mid x) \right)$ as we wanted. The resulting variational density is - -::: {.column-page} -$$ -q\_{\mu, \Sigma}(z) = p\_{\mathcal{N}(\mu, \Sigma)}\left( f(z) \right) \ \big| \det \mathcal{J}\_{f}(z) \big|. -$$ -::: - -Note that the way we've constructed $q(z)$ here is basically a reverse of the approach we described above. Here we sample from a distribution with support on $\mathbb{R}$ and transform *to* $\mathrm{supp} \left( p(z \mid x) \right)$. - -If we want to write the ELBO explicitly in terms of $\eta$ rather than $z$, the first term in the ELBO becomes - -::: {.column-page} -$$ -\begin{align*} - \mathbb{E}\_{z \sim q_{\mu, \Sigma}(z)} \left[ \log p(x\_i, z) \right] &= \mathbb{E}\_{\eta \sim \mathcal{N}(\mu, \Sigma)} \Bigg[ \log \frac{p\left(x\_i, f^{-1}(\eta) \right)}{\big| \det \mathcal{J}_{f^{-1}}(\eta) \big|} \Bigg] \\ - &= \mathbb{E}\_{\eta \sim \mathcal{N}(\mu, \Sigma)} \left[ \log p\left(x\_i, f^{-1}(\eta) \right) \right] - \mathbb{E}\_{\eta \sim \mathcal{N}(\mu, \Sigma)} \left[ \left| \det \mathcal{J}\_{f^{-1}}(\eta) \right| \right]. -\end{align*} -$$ -::: - -The entropy is invariant under change of variables, thus $\mathbb{H} \left(q\_{\mu, \Sigma}(z)\right)$ is simply the entropy of the normal distribution which is known analytically. - -Hence, the resulting empirical estimate of the ELBO is - -::: {.column-page} -$$ -\begin{align*} -\widehat{\mathrm{ELBO}}(q\_{\mu, \Sigma}) &= \frac{1}{m} \left( \sum\_{k = 1}^m \sum\_{i = 1}^n \left(\log p\left(x\_i, f^{-1}(\eta_k)\right) - \log \big| \det \mathcal{J}\_{f^{-1}}(\eta\_k) \big| \right) \right) + \mathbb{H} \left(p\_{\mathcal{N}(\mu, \Sigma)}(z)\right) \\ -& \text{where} \quad z\_k \sim \mathcal{N}(\mu, \Sigma) \quad \forall k = 1, \dots, m -\end{align*}. -$$ -::: - -And maximizing this wrt. $\mu$ and $\Sigma$ is what's referred to as **Automatic Differentiation Variational Inference (ADVI)**! - -Now if you want to try it out, [check out the tutorial on how to use ADVI in Turing.jl]({{}})! 
+---
+title: Variational Inference
+engine: julia
+aliases:
+  - ../../tutorials/docs-07-for-developers-variational-inference/index.html
+---
+
+# Overview
+
+In this post, we'll examine variational inference (VI), a family of approximate Bayesian inference methods. We will focus on one of the more standard VI methods, Automatic Differentiation Variational Inference (ADVI).
+
+Here, we'll examine the theory behind VI, but if you're interested in using ADVI in Turing, [check out this tutorial]({{}}).
+
+# Motivation
+
+In Bayesian inference, one usually specifies a model as follows: given data $\\{x_i\\}_{i = 1}^n$,
+
+::: {.column-page}
+$$
+\begin{align*}
+  \text{prior:} \quad z &\sim p(z) \\
+  \text{likelihood:} \quad x_i &\overset{\text{i.i.d.}}{\sim} p(x \mid z) \quad \text{where} \quad i = 1, \dots, n
+\end{align*}
+$$
+:::
+
+where $\overset{\text{i.i.d.}}{\sim}$ denotes that the samples are independently and identically distributed. Our goal in Bayesian inference is then to find the _posterior_
+
+::: {.column-page}
+$$
+p(z \mid \\{ x\_i \\}\_{i = 1}^n) \propto p(z) \prod\_{i=1}^{n} p(x\_i \mid z).
+$$
+:::
+
+In general, one cannot obtain a closed-form expression for $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$, but one might still be able to _sample_ from $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ with guarantees of converging to the target posterior $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ as the number of samples goes to $\infty$, e.g. MCMC.
+
+As you are hopefully already aware, Turing.jl provides many methods with asymptotic exactness guarantees that we can apply to such a problem!
+
+Unfortunately, these unbiased samplers can be prohibitively expensive to run. As the model $p$ increases in complexity, the convergence of these unbiased samplers can slow down dramatically. Still, in the _infinite_ limit, these methods should converge to the true posterior! But infinity is fairly large, like, _at least_ more than 12, so this might take a while.
+
+In such a case, it might be desirable to sacrifice some of these asymptotic guarantees and instead _approximate_ the posterior $p(z \mid \\{ x_i \\}_{i = 1}^n)$ using some other model which we'll denote $q(z)$.
+
+There are multiple approaches to take in this case, one of which is **variational inference (VI)**.
+
+# Variational Inference (VI)
+
+In VI, we're looking to approximate $p(z \mid \\{ x_i \\}_{i = 1}^n )$ using some _approximate_ or _variational_ posterior $q(z)$.
+
+To approximate something you need a notion of what "close" means. In the context of probability densities a standard such "measure" of closeness is the _Kullback-Leibler (KL) divergence_, though this is far from the only one. The KL-divergence is defined between two densities $q(z)$ and $p(z \mid \\{ x_i \\}_{i = 1}^n)$ as
+
+::: {.column-page}
+$$
+\begin{align*}
+  \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) &= \int \log \left( \frac{q(z)}{\prod\_{i = 1}^n p(z \mid x\_i)} \right) q(z) \mathrm{d}{z} \\\\
+  &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) - \sum\_{i = 1}^n \log p(z \mid x\_i) \right] \\\\
+  &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(z \mid x\_i) \right].
+\end{align*}
+$$
+:::
+
+It's worth noting that unfortunately the KL-divergence is _not_ a metric/distance in the analysis-sense due to its lack of symmetry. On the other hand, it turns out that minimizing the KL-divergence is actually equivalent to maximizing the log-likelihood! Also, under reasonable restrictions on the densities at hand,
+
+::: {.column-page}
+$$
+\mathrm{D\_{KL}}\left(q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) = 0 \quad \iff \quad q(z) = p(z \mid \\{ x\_i \\}\_{i = 1}^n), \quad \forall z.
+$$
+:::
+
+Therefore one could (and we will) attempt to approximate $p(z \mid \\{ x_i \\}_{i = 1}^n)$ using a density $q(z)$ by minimizing the KL-divergence between these two!
+
+One can also show that $\mathrm{D_{KL}} \ge 0$, which we'll need later. Finally notice that the KL-divergence is only well-defined when in fact $q(z)$ is zero everywhere $p(z \mid \\{ x_i \\}_{i = 1}^n)$ is zero, i.e.
+
+::: {.column-page}
+$$
+\mathrm{supp}\left(q(z)\right) \subseteq \mathrm{supp}\left(p(z \mid x)\right).
+$$
+:::
+
+Otherwise, there might be a point $z_0 \sim q(z)$ such that $p(z_0 \mid \\{ x_i \\}_{i = 1}^n) = 0$, resulting in $\log\left(\frac{q(z)}{0}\right)$ which doesn't make sense!
+
+One major problem: as we can see in the definition of the KL-divergence, we need $p(z \mid \\{ x_i \\}_{i = 1}^n)$ for any $z$ if we want to compute the KL-divergence between this and $q(z)$. We don't have that. The entire reason we even do Bayesian inference is that we don't know the posterior! Clearly this isn't going to work. _Or is it?!_
+
+## Computing KL-divergence without knowing the posterior
+
+First off, recall that
+
+::: {.column-page}
+$$
+p(z \mid x\_i) = \frac{p(x\_i, z)}{p(x\_i)}
+$$
+:::
+
+so we can write
+
+::: {.column-page}
+$$
+\begin{align*}
+\mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) - \log p(x\_i) \right] \\
+  &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] + \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x_i) \right] \\
+  &= \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] - \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] + \sum\_{i = 1}^n \log p(x\_i),
+\end{align*}
+$$
+:::
+
+where in the last equality we used the fact that $p(x_i)$ is independent of $z$.
+
+Now you're probably thinking "Oh great! Now you've introduced $p(x_i)$ which we _also_ can't compute (in general)!". Woah. Calm down, human. Let's do some more algebra. The above expression can be rearranged to
+
+::: {.column-page}
+$$
+\mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) + \underbrace{\sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] - \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right]}\_{=: \mathrm{ELBO}(q)} = \underbrace{\sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i) \right]}\_{\text{constant}}.
+$$
+:::
+
+See? The right-hand side is _constant_ and, as we mentioned before, $\mathrm{D_{KL}} \ge 0$. What happens if we try to _maximize_ the term we just gave the completely arbitrary name $\mathrm{ELBO}$? Well, if $\mathrm{ELBO}$ goes up while $p(x_i)$ stays constant then $\mathrm{D_{KL}}$ _has to_ go down! That is, the $q(z)$ which _minimizes_ the KL-divergence is the same $q(z)$ which _maximizes_ $\mathrm{ELBO}(q)$:
+
+::: {.column-page}
+$$
+\underset{q}{\mathrm{argmin}} \ \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) = \underset{q}{\mathrm{argmax}} \ \mathrm{ELBO}(q)
+$$
+:::
+
+where
+
+::: {.column-page}
+$$
+\begin{align*}
+\mathrm{ELBO}(q) &:= \left( \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] \right) - \mathbb{E}\_{z \sim q(z)} \left[ \log q(z) \right] \\
+  &= \left( \sum\_{i = 1}^n \mathbb{E}\_{z \sim q(z)} \left[ \log p(x\_i, z) \right] \right) + \mathbb{H}\left( q(z) \right)
+\end{align*}
+$$
+:::
+
+and $\mathbb{H} \left(q(z) \right)$ denotes the [(differential) entropy](https://www.wikiwand.com/en/Differential_entropy) of $q(z)$.
+
+Assuming the joint $p(x_i, z)$ and the entropy $\mathbb{H}\left(q(z)\right)$ are both tractable, we can use a Monte Carlo estimate for the remaining expectation. This leaves us with the following tractable expression:
+
+::: {.column-page}
+$$
+\underset{q}{\mathrm{argmin}} \ \mathrm{D\_{KL}} \left( q(z), p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right) \approx \underset{q}{\mathrm{argmax}} \ \widehat{\mathrm{ELBO}}(q)
+$$
+:::
+
+where
+
+::: {.column-page}
+$$
+\widehat{\mathrm{ELBO}}(q) = \frac{1}{m} \left( \sum\_{k = 1}^m \sum\_{i = 1}^n \log p(x\_i, z\_k) \right) + \mathbb{H} \left(q(z)\right) \quad \text{where} \quad z\_k \sim q(z) \quad \forall k = 1, \dots, m.
+$$
+:::
+
+Hence, as long as we can sample from $q(z)$ somewhat efficiently, we can indeed minimize the KL-divergence! Neat, eh?
+
+Sidenote: in the case where $q(z)$ is tractable but $\mathbb{H} \left(q(z) \right)$ is _not_, we can use a Monte Carlo estimate for this term too, but this generally results in a higher-variance estimate.
+
+Also, I fooled you real good: the ELBO _isn't_ an arbitrary name, hah! In fact it's an abbreviation for the **evidence lower bound (ELBO)** because it is, uhmm, well, a lower bound on the (log) evidence $\sum\_{i = 1}^n \log p(x\_i)$ (remember $\mathrm{D_{KL}} \ge 0$). Yup.
+
+## Maximizing the ELBO
+
+Finding the optimal $q$ over _all_ possible densities of course isn't feasible. Instead we consider a family of _parameterized_ densities $\mathscr{Q}\_{\Theta}$ where $\Theta$ denotes the space of possible parameters. Each density in this family $q\_{\theta} \in \mathscr{Q}\_{\Theta}$ is parameterized by a unique $\theta \in \Theta$. Moreover, we'll assume
+
+ 1. $q\_{\theta}(z)$, i.e. evaluating the probability density $q$ at any point $z$, is differentiable
+ 2. $z \sim q\_{\theta}(z)$, i.e. the process of sampling from $q\_{\theta}(z)$, is differentiable
+
+(1) is fairly straightforward, but (2) is a bit tricky. What does it even mean for a _sampling process_ to be differentiable? This is quite an interesting problem in its own right and would require something like a [50-page paper to properly review the different approaches (highly recommended read)](https://arxiv.org/abs/1906.10652).
+
+We're going to make use of a particular such approach which goes under a bunch of different names: _reparametrization trick_, _path derivative_, etc. This refers to making the assumption that all elements $q\_{\theta} \in \mathscr{Q}\_{\Theta}$ can be considered as reparameterizations of some base density, say $\bar{q}(z)$. That is, if $q\_{\theta} \in \mathscr{Q}\_{\Theta}$ then
+
+::: {.column-page}
+$$
+z \sim q\_{\theta}(z) \quad \iff \quad z := g\_{\theta}(\bar{z}) \quad \text{where} \quad \bar{z} \sim \bar{q}(z)
+$$
+:::
+
+for some function $g\_{\theta}$ differentiable wrt. $\theta$. So all $q_{\theta} \in \mathscr{Q}\_{\Theta}$ are using the *same* reparameterization-function $g$ but each $q\_{\theta}$ corresponds to a different choice of $\theta$ for $g\_{\theta}$.
+
+Under this assumption we can differentiate the sampling process by taking the derivative of $g\_{\theta}$ wrt. $\theta$, and thus we can differentiate the entire $\widehat{\mathrm{ELBO}}(q\_{\theta})$ wrt. $\theta$! With the gradient available we can either try to solve for optimality by setting the gradient equal to zero, or maximize $\widehat{\mathrm{ELBO}}(q\_{\theta})$ stepwise by traversing $\mathscr{Q}\_{\Theta}$ in the direction of steepest ascent. For the sake of generality, we're going to go with the stepwise approach.
+
+With all this nailed down, we eventually reach the section on **Automatic Differentiation Variational Inference (ADVI)**.
+
+## Automatic Differentiation Variational Inference (ADVI)
+
+So let's revisit the assumptions we've made at this point:
+
+ 1. The variational posterior $q\_{\theta}$ is in a parameterized family of densities denoted $\mathscr{Q}\_{\Theta}$, with $\theta \in \Theta$.
+
+ 2. $\mathscr{Q}\_{\Theta}$ is a space of _reparameterizable_ densities with $\bar{q}(z)$ as the base-density.
+
+ 3. The parameterization function $g\_{\theta}$ is differentiable wrt. $\theta$.
+
+ 4. Evaluation of the probability density $q\_{\theta}(z)$ is differentiable wrt. $\theta$.
+
+ 5. $\mathbb{H}\left(q\_{\theta}(z)\right)$ is tractable.
+
+ 6. Evaluation of the joint density $p(x, z)$ is tractable and differentiable wrt. $z$.
+
+ 7. The support of $q(z)$ is a subset of the support of $p(z \mid x)$: $\mathrm{supp}\left(q(z)\right) \subseteq \mathrm{supp}\left(p(z \mid x)\right)$.
+
+Not all of these are *necessary* to do VI, but they are very convenient and result in a fairly flexible approach. One distribution which has a density satisfying all of the above assumptions _except_ (7) (we'll get back to this in a second) for any tractable and differentiable $p(z \mid \\{ x\_i \\}\_{i = 1}^n)$ is the good ole' Gaussian/normal distribution:
+
+::: {.column-page}
+$$
+z \sim \mathcal{N}(\mu, \Sigma) \quad \iff \quad z = g\_{\mu, L}(\bar{z}) := \mu + L \bar{z} \quad \text{where} \quad \bar{z} \sim \bar{q}(z) := \mathcal{N}(0\_d, I\_{d \times d})
+$$
+:::
+
+where $\Sigma = L L^T$, with $L$ obtained from the Cholesky decomposition. Abusing notation a bit, we're going to write
+
+::: {.column-page}
+$$
+\theta = (\mu, \Sigma) := (\mu\_1, \dots, \mu\_d, L\_{11}, \dots, L\_{1, d}, L\_{2, 1}, \dots, L\_{2, d}, \dots, L\_{d, 1}, \dots, L\_{d, d}).
+$$
+:::
+
+With this assumption we finally have a tractable expression for $\widehat{\mathrm{ELBO}}(q_{\mu, \Sigma})$! Well, assuming (7) holds. Since a Gaussian has non-zero probability on the entirety of $\mathbb{R}^d$, we also require $p(z \mid \\{ x_i \\}_{i = 1}^n)$ to have non-zero probability on all of $\mathbb{R}^d$.
+
+Though not necessary, we'll often make a *mean-field* assumption for the variational posterior $q(z)$, i.e. assume independence between the latent variables. In this case, we'll write
+
+::: {.column-page}
+$$
+\theta = (\mu, \sigma^2) := (\mu\_1, \dots, \mu\_d, \sigma\_1^2, \dots, \sigma\_d^2).
+$$
+:::
+
+### Examples
+
+As a (trivial) example, we could apply the approach described above to the following generative model for $p(z \mid \\{ x_i \\}\_{i = 1}^n)$:
+
+::: {.column-page}
+$$
+\begin{align*}
+  m &\sim \mathcal{N}(0, 1) \\
+  x\_i &\overset{\text{i.i.d.}}{\sim} \mathcal{N}(m, 1), \quad i = 1, \dots, n.
+\end{align*}
+$$
+:::
+
+In this case $z = m$ and we have the posterior defined by $p(m \mid \\{ x\_i \\}\_{i = 1}^n) = p(m) \prod\_{i = 1}^n p(x\_i \mid m)$. Then the variational posterior would be
+
+::: {.column-page}
+$$
+q\_{\mu, \sigma} = \mathcal{N}(\mu, \sigma^2), \quad \text{where} \quad \mu \in \mathbb{R}, \ \sigma^2 \in \mathbb{R}^{ + }.
+$$
+:::
+
+And since the prior of $m$, $\mathcal{N}(0, 1)$, has non-zero probability on the entirety of $\mathbb{R}$, just like $q(m)$, assumption (7) above holds, and so everything is fine and life is good.
+
+But what about this generative model for $p(z \mid \\{ x_i \\}_{i = 1}^n)$:
+
+::: {.column-page}
+$$
+\begin{align*}
+    s &\sim \mathrm{InverseGamma}(2, 3), \\
+    m &\sim \mathcal{N}(0, s), \\
+    x\_i &\overset{\text{i.i.d.}}{\sim} \mathcal{N}(m, s), \quad i = 1, \dots, n,
+\end{align*}
+$$
+:::
+
+with posterior $p(s, m \mid \\{ x\_i \\}\_{i = 1}^n) = p(s) p(m \mid s) \prod\_{i = 1}^n p(x\_i \mid s, m)$ and the mean-field variational posterior $q(s, m)$ will be
+
+::: {.column-page}
+$$
+q\_{\mu\_1, \mu\_2, \sigma\_1^2, \sigma\_2^2}(s, m) = p\_{\mathcal{N}(\mu\_1, \sigma\_1^2)}(s)\ p\_{\mathcal{N}(\mu\_2, \sigma\_2^2)}(m),
+$$
+:::
+
+where we've denoted the evaluation of the probability density of a Gaussian as $p_{\mathcal{N}(\mu, \sigma^2)}(x)$.
+
+Observe that $\mathrm{InverseGamma}(2, 3)$ has non-zero probability only on $\mathbb{R}^{ + } := (0, \infty)$, which is clearly not all of $\mathbb{R}$ like $q(s, m)$ has, i.e.
+
+::: {.column-page}
+$$
+\mathrm{supp} \left( q(s, m) \right) \not\subseteq \mathrm{supp} \left( p(z \mid \\{ x\_i \\}\_{i = 1}^n) \right).
+$$
+:::
+
+Recall from the definition of the KL-divergence that when this is the case, the KL-divergence isn't well-defined. This gets us to the *automatic* part of ADVI.
+
+### "Automatic"? How?
+
+For a lot of the standard (continuous) densities $p$ we can actually construct a probability density $\tilde{p}$ with non-zero probability on all of $\mathbb{R}$ by *transforming* the "constrained" probability density $p$ to $\tilde{p}$. In fact, in these cases this is a one-to-one relationship. As we'll see, this helps solve the support-issue we've been going on and on about.
+
+#### Transforming densities using change of variables
+
+If we want to compute the probability of $x$ taking a value in some set $A \subseteq \mathrm{supp} \left( p(x) \right)$, we have to integrate $p(x)$ over $A$, i.e.
+
+::: {.column-page}
+$$
+\mathbb{P}_p(x \in A) = \int_A p(x) \mathrm{d}x.
+$$
+:::
+
+This means that if we have a differentiable bijection $f: \mathrm{supp} \left( p(x) \right) \to \mathbb{R}^d$ with differentiable inverse $f^{-1}: \mathbb{R}^d \to \mathrm{supp} \left( p(x) \right)$, we can perform a change of variables
+
+::: {.column-page}
+$$
+\mathbb{P}\_p(x \in A) = \int\_{f(A)} p \left(f^{-1}(y) \right) \ \left| \det \mathcal{J}\_{f^{-1}}(y) \right| \mathrm{d}y,
+$$
+:::
+
+where $\mathcal{J}\_{f^{-1}}(y)$ denotes the Jacobian of $f^{-1}$ evaluated at $y$.
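+
+To make this concrete, we can work through the $\mathrm{InverseGamma}(2, 3)$ density from the example above (a small worked example; the choice $f = \log$ is ours, though it is the standard transformation for distributions supported on $(0, \infty)$). Taking $f = \log$, we have $f^{-1} = \exp$ and $\big| \det \mathcal{J}\_{f^{-1}}(y) \big| = e^y$, so with $p(x) = \frac{3^2}{\Gamma(2)} x^{-3} e^{-3 / x}$ the transformed density becomes
+
+::: {.column-page}
+$$
+\tilde{p}(y) = p \left( e^y \right) e^y = 9 e^{-3y} \exp\left( -3 e^{-y} \right) e^y = 9 e^{-2y} \exp\left( -3 e^{-y} \right),
+$$
+:::
+
+which is strictly positive for every $y \in \mathbb{R}$, exactly as desired.
+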
+Observe that this defines a probability distribution
+
+::: {.column-page}
+$$
+\mathbb{P}\_{\tilde{p}}\left(y \in f(A) \right) = \int\_{f(A)} \tilde{p}(y) \mathrm{d}y,
+$$
+:::
+
+since $f\left(\mathrm{supp} (p(x)) \right) = \mathbb{R}^d$ which has probability 1. This probability distribution has *density* $\tilde{p}(y)$ with $\mathrm{supp} \left( \tilde{p}(y) \right) = \mathbb{R}^d$, defined
+
+::: {.column-page}
+$$
+\tilde{p}(y) = p \left( f^{-1}(y) \right) \ \left| \det \mathcal{J}\_{f^{-1}}(y) \right|
+$$
+:::
+
+or equivalently
+
+::: {.column-page}
+$$
+\tilde{p} \left( f(x) \right) = \frac{p(x)}{\big| \det \mathcal{J}\_{f}(x) \big|}
+$$
+:::
+
+due to the fact that, for $y = f(x)$,
+
+::: {.column-page}
+$$
+\big| \det \mathcal{J}\_{f^{-1}}(y) \big| = \big| \det \mathcal{J}\_{f}(x) \big|^{-1}.
+$$
+:::
+
+*Note: it's also necessary that the log-abs-det-Jacobian term is non-vanishing. This can for example be accomplished by assuming $f$ to also be elementwise monotonic.*
+
+#### Back to VI
+
+So why is this useful? Well, we're looking to generalize our approach using a normal distribution to cases where the supports don't match up. How about defining $q(z)$ by
+
+::: {.column-page}
+$$
+\begin{align*}
+  \eta &\sim \mathcal{N}(\mu, \Sigma), \\\\
+  z &= f^{-1}(\eta),
+\end{align*}
+$$
+:::
+
+where $f^{-1}: \mathbb{R}^d \to \mathrm{supp} \left( p(z \mid x) \right)$ is a differentiable bijection with differentiable inverse. Then $z \sim q_{\mu, \Sigma}(z) \implies z \in \mathrm{supp} \left( p(z \mid x) \right)$ as we wanted. The resulting variational density is
+
+::: {.column-page}
+$$
+q\_{\mu, \Sigma}(z) = p\_{\mathcal{N}(\mu, \Sigma)}\left( f(z) \right) \ \big| \det \mathcal{J}\_{f}(z) \big|.
+$$
+:::
+
+Note that the way we've constructed $q(z)$ here is basically a reverse of the approach we described above. Here we sample from a distribution with support on $\mathbb{R}^d$ and transform *to* $\mathrm{supp} \left( p(z \mid x) \right)$.
+
+If we want to write the ELBO explicitly in terms of $\eta$ rather than $z$, the first term in the ELBO becomes simply
+
+::: {.column-page}
+$$
+\mathbb{E}\_{z \sim q_{\mu, \Sigma}(z)} \left[ \log p(x\_i, z) \right] = \mathbb{E}\_{\eta \sim \mathcal{N}(\mu, \Sigma)} \left[ \log p\left(x\_i, f^{-1}(\eta) \right) \right],
+$$
+:::
+
+since $z = f^{-1}(\eta)$. The entropy, on the other hand, picks up the expected log-abs-det-Jacobian term under the change of variables,
+
+::: {.column-page}
+$$
+\mathbb{H} \left(q\_{\mu, \Sigma}(z)\right) = \mathbb{H} \left(p\_{\mathcal{N}(\mu, \Sigma)}\right) + \mathbb{E}\_{\eta \sim \mathcal{N}(\mu, \Sigma)} \left[ \log \big| \det \mathcal{J}\_{f^{-1}}(\eta) \big| \right],
+$$
+:::
+
+where $\mathbb{H} \left(p\_{\mathcal{N}(\mu, \Sigma)}\right)$ is the entropy of the normal distribution, which is known analytically.
+
+Hence, the resulting empirical estimate of the ELBO is
+
+::: {.column-page}
+$$
+\begin{align*}
+\widehat{\mathrm{ELBO}}(q\_{\mu, \Sigma}) &= \frac{1}{m} \sum\_{k = 1}^m \left( \sum\_{i = 1}^n \log p\left(x\_i, f^{-1}(\eta\_k)\right) + \log \big| \det \mathcal{J}\_{f^{-1}}(\eta\_k) \big| \right) + \mathbb{H} \left(p\_{\mathcal{N}(\mu, \Sigma)}\right) \\
+& \text{where} \quad \eta\_k \sim \mathcal{N}(\mu, \Sigma) \quad \forall k = 1, \dots, m.
+\end{align*}
+$$
+:::
+
+And maximizing this wrt. $\mu$ and $\Sigma$ is what's referred to as **Automatic Differentiation Variational Inference (ADVI)**!
+
+Now if you want to try it out, [check out the tutorial on how to use ADVI in Turing.jl]({{}})!
From fcff397b9b7cd75c9645e4fa9e788e53c4741787 Mon Sep 17 00:00:00 2001 From: Penelope Yong Date: Thu, 16 Jan 2025 11:28:08 +0000 Subject: [PATCH 4/4] unix2dos --- .../inference/implementing-samplers/index.qmd | 990 +++++++++--------- 1 file changed, 495 insertions(+), 495 deletions(-) diff --git a/developers/inference/implementing-samplers/index.qmd b/developers/inference/implementing-samplers/index.qmd index 847c2f117..9d69fbb80 100644 --- a/developers/inference/implementing-samplers/index.qmd +++ b/developers/inference/implementing-samplers/index.qmd @@ -1,495 +1,495 @@ ---- -title: Implementing Samplers -engine: julia -julia: - exeflags: ["--project=@.", "-t 4"] -aliases: - - ../../tutorials/docs-17-implementing-samplers/index.html ---- - -```{julia} -#| echo: false -#| output: false -using Pkg; -Pkg.instantiate(); -``` - -In this tutorial, we'll go through step-by-step how to implement a "simple" sampler in [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) in such a way that it can be easily applied to Turing.jl models. - -In particular, we're going to implement a version of **Metropolis-adjusted Langevin (MALA)**. - -Note that we will implement this sampler in the [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) framework, completely "ignoring" Turing.jl until the very end of the tutorial, at which point we'll use a single line of code to make the resulting sampler available to Turing.jl. This is to really drive home the point that one can implement samplers in a way that is accessible to all of Turing.jl's users without having to use Turing.jl yourself. - - -## Quick overview of MALA - -We can view MALA as a single step of the leapfrog intergrator with resampling of momentum $p$ at every step.[^2] To make that statement a bit more concrete, we first define the *extended* target $\bar{\gamma}(x, p)$ as - -\begin{equation*} -\log \bar{\gamma}(x, p) \propto \log \gamma(x) + \log \gamma_{\mathcal{N}(0, M)}(p) -\end{equation*} - -where $\gamma_{\mathcal{N}(0, M)}$ denotes the density for a zero-centered Gaussian with covariance matrix $M$. -We then consider targeting this joint distribution over both $x$ and $p$ as follows. -First we define the map - -\begin{equation*} -\begin{split} - L_{\epsilon}: \quad & \mathbb{R}^d \times \mathbb{R}^d \to \mathbb{R}^d \times \mathbb{R}^d \\ - & (x, p) \mapsto (\tilde{x}, \tilde{p}) := L_{\epsilon}(x, p) -\end{split} -\end{equation*} - -as - -\begin{equation*} -\begin{split} - p_{1 / 2} &:= p + \frac{\epsilon}{2} \nabla \log \gamma(x) \\ - \tilde{x} &:= x + \epsilon M^{-1} p_{1 /2 } \\ - p_1 &:= p_{1 / 2} + \frac{\epsilon}{2} \nabla \log \gamma(\tilde{x}) \\ - \tilde{p} &:= - p_1 -\end{split} -\end{equation*} - -This might be familiar for some readers as a single step of the Leapfrog integrator. -We then define the MALA kernel as follows: given the current iterate $x_i$, we sample the next iterate $x_{i + 1}$ as - -\begin{equation*} -\begin{split} - p &\sim \mathcal{N}(0, M) \\ - (\tilde{x}, \tilde{p}) &:= L_{\epsilon}(x_i, p) \\ - \alpha &:= \min \left\{ 1, \frac{\bar{\gamma}(\tilde{x}, \tilde{p})}{\bar{\gamma}(x_i, p)} \right\} \\ - x_{i + 1} &:= - \begin{cases} - \tilde{x} \quad & \text{ with prob. } \alpha \\ - x_i \quad & \text{ with prob. } 1 - \alpha - \end{cases} -\end{split} -\end{equation*} - -i.e. we accept the proposal $\tilde{x}$ with probability $\alpha$ and reject it, thus sticking with our current iterate, with probability $1 - \alpha$. 
- -## What we need from a model: [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) - -There are a few things we need from the "target" / "model" / density that we want to sample from: - -1. We need access to log-density *evaluations* $\log \gamma(x)$ so we can compute the acceptance ratio involving $\log \bar{\gamma}(x, p)$. -2. We need access to log-density *gradients* $\nabla \log \gamma(x)$ so we can compute the Leapfrog steps $L_{\epsilon}(x, p)$. -3. We also need access to the "size" of the model so we can determine the size of $M$. - -Luckily for us, there is a package called [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) which provides an interface for *exactly* this! - -To demonstrate how one can implement the "[LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface"[^1] we will use a simple Gaussian model as an example: - -```{julia} -using LogDensityProblems: LogDensityProblems; - -# Let's define some type that represents the model. -struct IsotropicNormalModel{M<:AbstractVector{<:Real}} - "mean of the isotropic Gaussian" - mean::M -end - -# Specifies what input length the model expects. -LogDensityProblems.dimension(model::IsotropicNormalModel) = length(model.mean) -# Implementation of the log-density evaluation of the model. -function LogDensityProblems.logdensity(model::IsotropicNormalModel, x::AbstractVector{<:Real}) - return - sum(abs2, x .- model.mean) / 2 -end -``` - -This gives us all of the properties we want for our MALA sampler with the exception of the computation of the *gradient* $\nabla \log \gamma(x)$. There is the method `LogDensityProblems.logdensity_and_gradient` which should return a 2-tuple where the first entry is the evaluation of the logdensity $\log \gamma(x)$ and the second entry is the gradient $\nabla \log \gamma(x)$. - -There are two ways to "implement" this method: 1) we implement it by hand, which is feasible in the case of our `IsotropicNormalModel`, or b) we defer the implementation of this to a automatic differentiation backend. - -To implement it by hand we can simply do - -```{julia} -# Tell LogDensityProblems.jl that first-order, i.e. gradient information, is available. -LogDensityProblems.capabilities(model::IsotropicNormalModel) = LogDensityProblems.LogDensityOrder{1}() - -# Implement `logdensity_and_gradient`. -function LogDensityProblems.logdensity_and_gradient(model::IsotropicNormalModel, x) - logγ_x = LogDensityProblems.logdensity(model, x) - ∇logγ_x = -x .* (x - model.mean) - return logγ_x, ∇logγ_x -end -``` - -Let's just try it out: - -```{julia} -# Instantiate the problem. -model = IsotropicNormalModel([-5., 0., 5.]) -# Create some example input that we can test on. -x_example = randn(LogDensityProblems.dimension(model)) -# Evaluate! -LogDensityProblems.logdensity(model, x_example) -``` - -To defer it to an automatic differentiation backend, we can do - -```{julia} -# Tell LogDensityProblems.jl we only have access to 0-th order information. -LogDensityProblems.capabilities(model::IsotropicNormalModel) = LogDensityProblems.LogDensityOrder{0}() - -# Use `LogDensityProblemsAD`'s `ADgradient` in combination with some AD backend to implement `logdensity_and_gradient`. 
-using LogDensityProblemsAD, ADTypes, ForwardDiff -model_with_grad = ADgradient(AutoForwardDiff(), model) -LogDensityProblems.logdensity(model_with_grad, x_example) -``` - -We'll continue with the second approach in this tutorial since this is typically what one does in practice, because there are better hobbies to spend time on than deriving gradients by hand. - -At this point, one might wonder how we're going to tie this back to Turing.jl in the end. Effectively, when working with inference methods that only require log-density evaluations and / or higher-order information of the log-density, Turing.jl actually converts the user-provided `Model` into an object implementing the above methods for [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl). As a result, most samplers provided by Turing.jl are actually implemented to work with [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl), enabling their use both *within* Turing.jl and *outside* of Turing.jl! Morever, there exists similar conversions for Stan through BridgeStan and Stan[LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl), which means that a sampler supporting the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface can easily be used on both Turing.jl *and* Stan models (in addition to user-provided models, as our `IsotropicNormalModel` above)! - -Anyways, let's move on to actually implementing the sampler. - -## Implementing MALA in [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) - -Now that we've established that a model implementing the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface provides us with all the information we need from $\log \gamma(x)$, we can address the question: given an object that implements the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface, how can we define a sampler for it? - -We're going to do this by making our sampler a sub-type of `AbstractMCMC.AbstractSampler` in addition to implementing a few methods from [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl). Why? Because it gets us *a lot* of functionality for free, as we will see later. - -Moreover, [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) provides a very natural interface for MCMC algorithms. - -First, we'll define our `MALA` type - -```{julia} -using AbstractMCMC - -struct MALA{T,A} <: AbstractMCMC.AbstractSampler - "stepsize used in the leapfrog step" - ϵ_init::T - "covariance matrix used for the momentum" - M_init::A -end -``` - -Notice how we've added the suffix `_init` to both the stepsize and the covariance matrix. We've done this because a `AbstractMCMC.AbstractSampler` should be *immutable*. Of course there might be many scenarios where we want to allow something like the stepsize and / or the covariance matrix to vary between iterations, e.g. during the burn-in / adaptation phase of the sampling process we might want to adjust the parameters using statistics computed from these initial iterations. But information which can change between iterations *should not go in the sampler itself*! Instead, this information should go in the sampler *state*. - -The sampler state should at the very least contain all the necessary information to perform the next MCMC iteration, but usually contains further information, e.g. quantities and statistics useful for evaluating whether the sampler has converged. 
-
-We will use the following sampler state for our `MALA` sampler:
-
-```{julia}
-struct MALAState{A<:AbstractVector{<:Real}}
-    "current position"
-    x::A
-end
-```
-
-This might seem redundant: we're defining a type `MALAState` and it only contains a simple vector of reals.
-In this particular case we could indeed have dropped this and simply used an `AbstractVector{<:Real}` as our sampler state, but typically, as we will see later, one wants to include other quantities in the sampler state.
-For example, if we wanted to adapt the parameters of our `MALA`, e.g. alter the stepsize depending on acceptance rates, we would also need to keep `ϵ` in the state; but for now we'll keep things simple.
-
-Moreover, we also want a _sample_ type, which is a type meant for "public consumption", i.e. the end-user. This is generally going to contain a subset of the information present in the state. But in such a simple scenario as this, we similarly only have an `AbstractVector{<:Real}`:
-
-```{julia}
-struct MALASample{A<:AbstractVector{<:Real}}
-    "current position"
-    x::A
-end
-```
-
-We currently have three things:
-
-1. An `AbstractMCMC.AbstractSampler` implementation called `MALA`.
-2. A state `MALAState` for our sampler `MALA`.
-3. A sample `MALASample` for our sampler `MALA`.
-
-That means that we're ready to implement the only thing that really matters: `AbstractMCMC.step`.
-
-`AbstractMCMC.step` defines the MCMC iteration of our `MALA` given the current `MALAState`. Specifically, the signature of the function is as follows:
-
-```{julia}
-#| eval: false
-function AbstractMCMC.step(
-    # The RNG to ensure reproducibility.
-    rng::Random.AbstractRNG,
-    # The model that defines our target.
-    model::AbstractMCMC.AbstractModel,
-    # The sampler for which we're taking a `step`.
-    sampler::AbstractMCMC.AbstractSampler,
-    # The current sampler `state`.
-    state;
-    # Additional keyword arguments that we may or may not need.
-    kwargs...
-)
-```
-
-Moreover, there is a specific `AbstractMCMC.AbstractModel` which is used to indicate that the provided model implements the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface: `AbstractMCMC.LogDensityModel`.
-
-Since, as we discussed earlier, in our case we're indeed going to work with types that support the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface, we'll define `AbstractMCMC.step` for such an `AbstractMCMC.LogDensityModel`.
-
-Note that `AbstractMCMC.LogDensityModel` has no other purpose; it has a single field called `logdensity`, and it does nothing else. But wrapping the model in an `AbstractMCMC.LogDensityModel` allows samplers that want to work with [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) to define their `AbstractMCMC.step` on this type without running into method ambiguities.
-
-All in all, that means that the signature for our `AbstractMCMC.step` is going to be the following:
-
-```{julia}
-#| eval: false
-function AbstractMCMC.step(
-    rng::Random.AbstractRNG,
-    # `LogDensityModel` so we know we're working with a LogDensityProblems.jl model.
-    model::AbstractMCMC.LogDensityModel,
-    # Our sampler.
-    sampler::MALA,
-    # Our sampler state.
-    state::MALAState;
-    kwargs...
-)
-```
-
-Great! Now let's actually implement the full `AbstractMCMC.step` for our `MALA`.
-
-Let's remind ourselves what we're going to do:
-
-1. Sample a new momentum $p$.
-2. Compute the log-density of the extended target $\log \bar{\gamma}(x, p)$.
-3. Take a single leapfrog step $(\tilde{x}, \tilde{p}) = L_{\epsilon}(x, p)$.
-4. Accept or reject the proposed $(\tilde{x}, \tilde{p})$.
-
-All in all, this results in the following:
-
-```{julia}
-using Random: Random
-using Distributions # so we get the `MvNormal`
-
-function AbstractMCMC.step(
-    rng::Random.AbstractRNG,
-    model_wrapper::AbstractMCMC.LogDensityModel,
-    sampler::MALA,
-    state::MALAState;
-    kwargs...
-)
-    # Extract the wrapped model which implements LogDensityProblems.jl.
-    model = model_wrapper.logdensity
-    # Let's just extract the sampler parameters to make our lives easier.
-    ϵ = sampler.ϵ_init
-    M = sampler.M_init
-    # Extract the current parameters.
-    x = state.x
-    # Sample the momentum.
-    p_dist = MvNormal(zeros(LogDensityProblems.dimension(model)), M)
-    p = rand(rng, p_dist)
-    # Propose using a single leapfrog step.
-    x̃, p̃ = leapfrog_step(model, x, p, ϵ, M)
-    # Accept or reject proposal.
-    logp = LogDensityProblems.logdensity(model, x) + logpdf(p_dist, p)
-    logp̃ = LogDensityProblems.logdensity(model, x̃) + logpdf(p_dist, p̃)
-    logα = logp̃ - logp
-    state_new = if log(rand(rng)) < logα
-        # Accept.
-        MALAState(x̃)
-    else
-        # Reject.
-        MALAState(x)
-    end
-    # Return the "sample" and the sampler state.
-    return MALASample(state_new.x), state_new
-end
-```
-
-Fairly straightforward.
-
-Of course, we haven't defined the `leapfrog_step` method yet, so let's do that:
-
-```{julia}
-function leapfrog_step(model, x, p, ϵ, M)
-    # Update momentum `p` using "position" `x`.
-    ∇logγ_x = last(LogDensityProblems.logdensity_and_gradient(model, x))
-    p1 = p + (ϵ / 2) .* ∇logγ_x
-    # Update the "position" `x` using momentum `p1`.
-    x̃ = x + ϵ .* (M \ p1)
-    # Update momentum `p1` using position `x̃`.
-    ∇logγ_x̃ = last(LogDensityProblems.logdensity_and_gradient(model, x̃))
-    p2 = p1 + (ϵ / 2) .* ∇logγ_x̃
-    # Flip momentum `p2`.
-    p̃ = -p2
-    return x̃, p̃
-end
-```
-
-With all of this, we're technically ready to sample!
-
-```{julia}
-using Random, LinearAlgebra
-
-rng = Random.default_rng()
-sampler = MALA(1, I)
-state = MALAState(zeros(LogDensityProblems.dimension(model)))
-
-x_next, state_next = AbstractMCMC.step(
-    rng,
-    AbstractMCMC.LogDensityModel(model),
-    sampler,
-    state
-)
-```
-
-Great, it works!
-
-And I promised we would get quite a lot of functionality for free if we implemented `AbstractMCMC.step`, and so we can now simply call `sample` to perform standard MCMC sampling:
-
-```{julia}
-# Perform 10 000 iterations with our `MALA` sampler.
-samples = sample(model_with_grad, sampler, 10_000; initial_state=state, progress=false)
-# Concatenate into a matrix.
-samples_matrix = stack(sample -> sample.x, samples)
-```
-
-```{julia}
-# Compute the marginal means and standard deviations.
-hcat(mean(samples_matrix; dims=2), std(samples_matrix; dims=2))
-```
-
-Let's visualize the samples:
-
-```{julia}
-using StatsPlots
-plot(transpose(samples_matrix[:, 1:10:end]), alpha=0.5, legend=false)
-```
-
-Look at that! Things are working; amazin'.
-
-We can also exploit [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl)'s parallel sampling capabilities:
-
-```{julia}
-# Run 4 separate chains for 10 000 iterations, using threads to parallelize.
-num_chains = 4
-samples = sample(
-    model_with_grad,
-    sampler,
-    MCMCThreads(),
-    10_000,
-    num_chains;
-    # Note we need to provide an initial state for every chain.
-    initial_state=fill(state, num_chains),
-    progress=false
-)
-samples_array = stack(map(Base.Fix1(stack, sample -> sample.x), samples))
-```
-
-But the fact that we have to provide `AbstractMCMC.sample` and friends with an `initial_state` to get started is a bit annoying. We can avoid this by also defining an `AbstractMCMC.step` *without* the `state` argument:
-
-```{julia}
-function AbstractMCMC.step(
-    rng::Random.AbstractRNG,
-    model_wrapper::AbstractMCMC.LogDensityModel,
-    ::MALA;
-    # NOTE: No state provided!
-    kwargs...
-)
-    model = model_wrapper.logdensity
-    # Let's just create the initial state by sampling from a standard Gaussian.
-    x = randn(rng, LogDensityProblems.dimension(model))
-
-    return MALASample(x), MALAState(x)
-end
-```
-
-Equipped with this, we no longer need to provide the `initial_state` everywhere:
-
-```{julia}
-samples = sample(model_with_grad, sampler, 10_000; progress=false)
-samples_matrix = stack(sample -> sample.x, samples)
-hcat(mean(samples_matrix; dims=2), std(samples_matrix; dims=2))
-```
-
-## Using our sampler with Turing.jl
-
-As we promised, all of this hassle of implementing our `MALA` sampler in a way that uses [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) and [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) gets us something more than *just* an "automatic" implementation of `AbstractMCMC.sample`.
-
-It also enables use with Turing.jl through `externalsampler`, but we need to do one final thing first: we need to tell Turing.jl how to extract a vector of parameters from the "sample" returned in our implementation of `AbstractMCMC.step`. In our case, the "sample" is a `MALASample`, so we just need the following line:
-
-```{julia}
-# Load Turing.jl.
-using Turing
-
-# Overload the `getparams` method for our "sample" type, which is just a vector.
-Turing.Inference.getparams(::Turing.Model, sample::MALASample) = sample.x
-```
-
-And with that, we're good to go!
-
-```{julia}
-# Our previous model defined as a Turing.jl model.
-@model mvnormal_model() = x ~ MvNormal([-5., 0., 5.], I)
-# Instantiate our model.
-turing_model = mvnormal_model()
-# Call `sample` but now we're passing in a Turing.jl `model` and wrapping
-# our `MALA` sampler in the `externalsampler` to tell Turing.jl that the sampler
-# expects something that implements LogDensityProblems.jl.
-chain = sample(turing_model, externalsampler(sampler), 10_000; progress=false)
-```
-
-Pretty neat, eh?
-
-### Models with constrained parameters
-
-One thing we've sort of glossed over in all of the above is that MALA, at least as we've implemented it, requires $x$ to live in $\mathbb{R}^d$ for some $d > 0$. If some of the parameters were in fact constrained, e.g. we were working with a `Beta` distribution which has support on the interval $(0, 1)$, *not* on $\mathbb{R}^d$, we could easily end up outside of the valid range $(0, 1)$.
-
-```{julia}
-@model beta_model() = x ~ Beta(3, 3)
-turing_model = beta_model()
-chain = sample(turing_model, externalsampler(sampler), 10_000; progress=false)
-```
-
-Yep, that still works, but only because Turing.jl actually *transforms* the `turing_model` from constrained to unconstrained, so that the `sampler` provided to `externalsampler` is actually always working in unconstrained space! This automatic transformation is not always desirable, so we can turn it off:
-
-```{julia}
-chain = sample(turing_model, externalsampler(sampler; unconstrained=false), 10_000; progress=false)
-```
-
-The fun thing is that this still sort of works, because
-
-```{julia}
-logpdf(Beta(3, 3), 10.0)
-```
-
-and so proposals that fall outside of the valid range are always rejected. But do notice how much worse all the diagnostics are; e.g. `ess_tail` is very poor compared to when we use `unconstrained=true`. Moreover, in more complex cases this won't just result in a "nice" `-Inf` log-density value, but will error instead:
-
-```{julia}
-#| error: true
-@model function demo()
-    σ² ~ truncated(Normal(), lower=0)
-    # If we end up with negative values for `σ²`, the `Normal` will error.
-    x ~ Normal(0, σ²)
-end
-sample(demo(), externalsampler(sampler; unconstrained=false), 10_000; progress=false)
-```
-
-As expected, we run into a `DomainError` at some point. If we instead set `unconstrained=true`, letting Turing.jl transform the model to an unconstrained form behind the scenes, everything works as expected:
-
-```{julia}
-sample(demo(), externalsampler(sampler; unconstrained=true), 10_000; progress=false)
-```
-
-Neat!
-
-Similarly, the automatic differentiation backend to use can be specified through the `adtype` keyword argument. For example, if we want to use [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) instead of the default [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl):
-
-```{julia}
-using ReverseDiff: ReverseDiff
-# Specify that we want to use `AutoReverseDiff`.
-sample(
-    demo(),
-    externalsampler(sampler; unconstrained=true, adtype=AutoReverseDiff()),
-    10_000;
-    progress=false
-)
-```
-
-Double-neat.
-
-## Summary
-
-At this point it's worth reminding ourselves of what we did and also *why* we did it:
-
-1. We define our models in the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface because it makes the sampler agnostic to how the underlying model is implemented.
-2. We implement our sampler in the [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) interface, which just means that our sampler is a subtype of `AbstractMCMC.AbstractSampler` and we implement the MCMC transition in `AbstractMCMC.step`.
-3. Points 1 and 2 make it so that our sampler can be used with a wide range of model implementations, among them models implemented in both Turing.jl and Stan. This gives you, the inference implementer, a large collection of models to test your inference method on, in addition to allowing users of Turing.jl and Stan to try out your inference method with minimal effort.
-
-[^1]: There is no such thing as a proper interface in Julia (at least not officially), and so we use the word "interface" here to mean a few minimal methods that need to be implemented by any type that we treat as a target model.
-
-[^2]: We're going with the leapfrog formulation because in a future version of this tutorial we'll add a section extending this simple "baseline" MALA sampler to more complex versions. See [issue #479](https://github.com/TuringLang/docs/issues/479) for progress on this.
+---
+title: Implementing Samplers
+engine: julia
+julia:
+    exeflags: ["--project=@.", "-t 4"]
+aliases:
+  - ../../tutorials/docs-17-implementing-samplers/index.html
+---
+
+```{julia}
+#| echo: false
+#| output: false
+using Pkg;
+Pkg.instantiate();
+```
+
+In this tutorial, we'll go through, step by step, how to implement a "simple" sampler in [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) in such a way that it can be easily applied to Turing.jl models.
+
+In particular, we're going to implement a version of the **Metropolis-adjusted Langevin algorithm (MALA)**.
+
+Note that we will implement this sampler in the [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) framework, completely "ignoring" Turing.jl until the very end of the tutorial, at which point we'll use a single line of code to make the resulting sampler available to Turing.jl. This is to really drive home the point that one can implement samplers in a way that is accessible to all of Turing.jl's users without having to use Turing.jl yourself.
+
+
+## Quick overview of MALA
+
+We can view MALA as a single step of the leapfrog integrator, with the momentum $p$ resampled at every step.[^2] To make that statement a bit more concrete, we first define the *extended* target $\bar{\gamma}(x, p)$ as
+
+\begin{equation*}
+\log \bar{\gamma}(x, p) \propto \log \gamma(x) + \log \gamma_{\mathcal{N}(0, M)}(p)
+\end{equation*}
+
+where $\gamma_{\mathcal{N}(0, M)}$ denotes the density for a zero-centered Gaussian with covariance matrix $M$.
+We then consider targeting this joint distribution over both $x$ and $p$ as follows.
+First we define the map
+
+\begin{equation*}
+\begin{split}
+    L_{\epsilon}: \quad & \mathbb{R}^d \times \mathbb{R}^d \to \mathbb{R}^d \times \mathbb{R}^d \\
+    & (x, p) \mapsto (\tilde{x}, \tilde{p}) := L_{\epsilon}(x, p)
+\end{split}
+\end{equation*}
+
+as
+
+\begin{equation*}
+\begin{split}
+    p_{1/2} &:= p + \frac{\epsilon}{2} \nabla \log \gamma(x) \\
+    \tilde{x} &:= x + \epsilon M^{-1} p_{1/2} \\
+    p_1 &:= p_{1/2} + \frac{\epsilon}{2} \nabla \log \gamma(\tilde{x}) \\
+    \tilde{p} &:= - p_1
+\end{split}
+\end{equation*}
+
+This might be familiar to some readers as a single step of the leapfrog integrator.
+We then define the MALA kernel as follows: given the current iterate $x_i$, we sample the next iterate $x_{i + 1}$ as
+
+\begin{equation*}
+\begin{split}
+    p &\sim \mathcal{N}(0, M) \\
+    (\tilde{x}, \tilde{p}) &:= L_{\epsilon}(x_i, p) \\
+    \alpha &:= \min \left\{ 1, \frac{\bar{\gamma}(\tilde{x}, \tilde{p})}{\bar{\gamma}(x_i, p)} \right\} \\
+    x_{i + 1} &:=
+    \begin{cases}
+        \tilde{x} \quad & \text{ with prob. } \alpha \\
+        x_i \quad & \text{ with prob. } 1 - \alpha
+    \end{cases}
+\end{split}
+\end{equation*}
+
+i.e. we accept the proposal $\tilde{x}$ with probability $\alpha$ and reject it, thus sticking with our current iterate, with probability $1 - \alpha$.
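+
+To make the kernel concrete before we wire it into any interfaces, here is a minimal, self-contained sketch of a single MALA transition in plain Julia. The names `logγ` and `∇logγ` are placeholders for this sketch only; below we will obtain both through the LogDensityProblems.jl interface instead.
+
+```{julia}
+#| eval: false
+# A sketch of one MALA transition, assuming `logγ(x)` and `∇logγ(x)` are given,
+# and taking `M = I` for simplicity (so sampling the momentum is just `randn`).
+function mala_transition_sketch(logγ, ∇logγ, x, ϵ)
+    p = randn(length(x))  # sample momentum p ~ N(0, I)
+    # One leapfrog step (x, p) -> (x̃, p̃), including the momentum flip.
+    p_half = p + (ϵ / 2) .* ∇logγ(x)
+    x̃ = x + ϵ .* p_half
+    p̃ = -(p_half + (ϵ / 2) .* ∇logγ(x̃))
+    # Accept or reject using the extended target log γ̄(x, p) = log γ(x) - p'p / 2 + const.
+    logα = (logγ(x̃) - sum(abs2, p̃) / 2) - (logγ(x) - sum(abs2, p) / 2)
+    return log(rand()) < logα ? x̃ : x
+end
+```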
+
+## What we need from a model: [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl)
+
+There are a few things we need from the "target" / "model" / density that we want to sample from:
+
+1. We need access to log-density *evaluations* $\log \gamma(x)$ so we can compute the acceptance ratio involving $\log \bar{\gamma}(x, p)$.
+2. We need access to log-density *gradients* $\nabla \log \gamma(x)$ so we can compute the leapfrog steps $L_{\epsilon}(x, p)$.
+3. We also need access to the "size" of the model so we can determine the size of $M$.
+
+Luckily for us, there is a package called [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) which provides an interface for *exactly* this!
+
+To demonstrate how one can implement the "[LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface"[^1] we will use a simple Gaussian model as an example:
+
+```{julia}
+using LogDensityProblems: LogDensityProblems;
+
+# Let's define some type that represents the model.
+struct IsotropicNormalModel{M<:AbstractVector{<:Real}}
+    "mean of the isotropic Gaussian"
+    mean::M
+end
+
+# Specifies what input length the model expects.
+LogDensityProblems.dimension(model::IsotropicNormalModel) = length(model.mean)
+# Implementation of the log-density evaluation of the model.
+function LogDensityProblems.logdensity(model::IsotropicNormalModel, x::AbstractVector{<:Real})
+    return - sum(abs2, x .- model.mean) / 2
+end
+```
+
+This gives us all of the properties we want for our MALA sampler with the exception of the computation of the *gradient* $\nabla \log \gamma(x)$. There is the method `LogDensityProblems.logdensity_and_gradient` which should return a 2-tuple where the first entry is the evaluation of the log-density $\log \gamma(x)$ and the second entry is the gradient $\nabla \log \gamma(x)$.
+
+There are two ways to "implement" this method: (a) we implement it by hand, which is feasible in the case of our `IsotropicNormalModel`, or (b) we defer the implementation to an automatic differentiation backend.
+
+To implement it by hand we can simply do:
+
+```{julia}
+# Tell LogDensityProblems.jl that first-order information, i.e. the gradient, is available.
+LogDensityProblems.capabilities(model::IsotropicNormalModel) = LogDensityProblems.LogDensityOrder{1}()
+
+# Implement `logdensity_and_gradient`.
+function LogDensityProblems.logdensity_and_gradient(model::IsotropicNormalModel, x)
+    logγ_x = LogDensityProblems.logdensity(model, x)
+    # Gradient of `-sum(abs2, x .- model.mean) / 2` with respect to `x`.
+    ∇logγ_x = -(x .- model.mean)
+    return logγ_x, ∇logγ_x
+end
+```
+
+Let's just try it out:
+
+```{julia}
+# Instantiate the problem.
+model = IsotropicNormalModel([-5., 0., 5.])
+# Create some example input that we can test on.
+x_example = randn(LogDensityProblems.dimension(model))
+# Evaluate!
+LogDensityProblems.logdensity(model, x_example)
+```
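+
+Hand-derived gradients are notoriously easy to get wrong, so it can be worth double-checking our `logdensity_and_gradient` against an automatic differentiation backend. Here is one way to do that with ForwardDiff.jl (which we will load again below); this check is just a suggestion, not part of the LogDensityProblems.jl interface:
+
+```{julia}
+using ForwardDiff: ForwardDiff
+
+# Compare our hand-written gradient to the one computed by ForwardDiff.jl.
+∇_hand = last(LogDensityProblems.logdensity_and_gradient(model, x_example))
+∇_ad = ForwardDiff.gradient(x -> LogDensityProblems.logdensity(model, x), x_example)
+∇_hand ≈ ∇_ad
+```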
+
+To defer it to an automatic differentiation backend, we can do:
+
+```{julia}
+# Tell LogDensityProblems.jl we only have access to 0-th order information.
+LogDensityProblems.capabilities(model::IsotropicNormalModel) = LogDensityProblems.LogDensityOrder{0}()
+
+# Use `LogDensityProblemsAD`'s `ADgradient` in combination with some AD backend to implement `logdensity_and_gradient`.
+using LogDensityProblemsAD, ADTypes, ForwardDiff
+model_with_grad = ADgradient(AutoForwardDiff(), model)
+LogDensityProblems.logdensity(model_with_grad, x_example)
+```
+
+We'll continue with the second approach in this tutorial since this is typically what one does in practice; there are better hobbies to spend time on than deriving gradients by hand.
+
+At this point, one might wonder how we're going to tie this back to Turing.jl in the end. Effectively, when working with inference methods that only require log-density evaluations and/or higher-order information of the log-density, Turing.jl actually converts the user-provided `Model` into an object implementing the above methods for [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl). As a result, most samplers provided by Turing.jl are actually implemented to work with [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl), enabling their use both *within* Turing.jl and *outside* of Turing.jl! Moreover, similar conversions exist for Stan through BridgeStan and StanLogDensityProblems.jl, which means that a sampler supporting the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface can easily be used on both Turing.jl *and* Stan models (in addition to user-provided models, such as our `IsotropicNormalModel` above)!
+
+Anyway, let's move on to actually implementing the sampler.
+
+## Implementing MALA in [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl)
+
+Now that we've established that a model implementing the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface provides us with all the information we need from $\log \gamma(x)$, we can address the question: given an object that implements the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface, how can we define a sampler for it?
+
+We're going to do this by making our sampler a subtype of `AbstractMCMC.AbstractSampler`, in addition to implementing a few methods from [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl). Why? Because it gets us *a lot* of functionality for free, as we will see later.
+
+Moreover, [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) provides a very natural interface for MCMC algorithms.
+
+First, we'll define our `MALA` type:
+
+```{julia}
+using AbstractMCMC
+
+struct MALA{T,A} <: AbstractMCMC.AbstractSampler
+    "stepsize used in the leapfrog step"
+    ϵ_init::T
+    "covariance matrix used for the momentum"
+    M_init::A
+end
+```
+
+Notice how we've added the suffix `_init` to both the stepsize and the covariance matrix. We've done this because an `AbstractMCMC.AbstractSampler` should be *immutable*. Of course there might be many scenarios where we want to allow something like the stepsize and/or the covariance matrix to vary between iterations. For example, during the burn-in / adaptation phase of the sampling process we might want to adjust the parameters using statistics computed from these initial iterations. But information which can change between iterations *should not go in the sampler itself*! Instead, this information should go in the sampler *state*.
+
+The sampler state should at the very least contain all the necessary information to perform the next MCMC iteration, but usually contains further information, e.g. quantities and statistics useful for evaluating whether the sampler has converged.
+
+We will use the following sampler state for our `MALA` sampler:
+
+```{julia}
+struct MALAState{A<:AbstractVector{<:Real}}
+    "current position"
+    x::A
+end
+```
+
+This might seem redundant: we're defining a type `MALAState` and it only contains a simple vector of reals.
+In this particular case we could indeed have dropped this and simply used an `AbstractVector{<:Real}` as our sampler state, but typically, as we will see later, one wants to include other quantities in the sampler state.
+For example, if we wanted to adapt the parameters of our `MALA`, e.g. alter the stepsize depending on acceptance rates, we would also need to keep `ϵ` in the state; but for now we'll keep things simple.
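+
+For the curious, a state for a hypothetical step-size-adapting variant of `MALA` might look something like the following sketch. This is purely illustrative (we won't use it anywhere in this tutorial), and the type and field names are made up:
+
+```{julia}
+#| eval: false
+# A sketch of a sampler state that also carries adaptation information.
+struct AdaptiveMALAState{A<:AbstractVector{<:Real},T<:Real}
+    "current position"
+    x::A
+    "current stepsize (can change between iterations)"
+    ϵ::T
+    "number of accepted proposals so far, e.g. to steer `ϵ` towards a target acceptance rate"
+    num_accepted::Int
+end
+```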
+
+Moreover, we also want a _sample_ type, which is a type meant for "public consumption", i.e. the end-user. This is generally going to contain a subset of the information present in the state. But in such a simple scenario as this, we similarly only have an `AbstractVector{<:Real}`:
+
+```{julia}
+struct MALASample{A<:AbstractVector{<:Real}}
+    "current position"
+    x::A
+end
+```
+
+We currently have three things:
+
+1. An `AbstractMCMC.AbstractSampler` implementation called `MALA`.
+2. A state `MALAState` for our sampler `MALA`.
+3. A sample `MALASample` for our sampler `MALA`.
+
+That means that we're ready to implement the only thing that really matters: `AbstractMCMC.step`.
+
+`AbstractMCMC.step` defines the MCMC iteration of our `MALA` given the current `MALAState`. Specifically, the signature of the function is as follows:
+
+```{julia}
+#| eval: false
+function AbstractMCMC.step(
+    # The RNG to ensure reproducibility.
+    rng::Random.AbstractRNG,
+    # The model that defines our target.
+    model::AbstractMCMC.AbstractModel,
+    # The sampler for which we're taking a `step`.
+    sampler::AbstractMCMC.AbstractSampler,
+    # The current sampler `state`.
+    state;
+    # Additional keyword arguments that we may or may not need.
+    kwargs...
+)
+```
+
+Moreover, there is a specific `AbstractMCMC.AbstractModel` which is used to indicate that the provided model implements the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface: `AbstractMCMC.LogDensityModel`.
+
+Since, as we discussed earlier, in our case we're indeed going to work with types that support the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface, we'll define `AbstractMCMC.step` for such an `AbstractMCMC.LogDensityModel`.
+
+Note that `AbstractMCMC.LogDensityModel` has no other purpose; it has a single field called `logdensity`, and it does nothing else. But wrapping the model in an `AbstractMCMC.LogDensityModel` allows samplers that want to work with [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) to define their `AbstractMCMC.step` on this type without running into method ambiguities.
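+
+Conceptually, `AbstractMCMC.LogDensityModel` is nothing more than a thin wrapper; it is roughly equivalent to the following sketch (simplified, so see AbstractMCMC.jl itself for the actual definition):
+
+```{julia}
+#| eval: false
+# Roughly what `AbstractMCMC.LogDensityModel` looks like: a thin marker wrapper
+# around anything that implements the LogDensityProblems.jl interface.
+struct LogDensityModel{L} <: AbstractMCMC.AbstractModel
+    logdensity::L
+end
+```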
+
+All in all, that means that the signature for our `AbstractMCMC.step` is going to be the following:
+
+```{julia}
+#| eval: false
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG,
+    # `LogDensityModel` so we know we're working with a LogDensityProblems.jl model.
+    model::AbstractMCMC.LogDensityModel,
+    # Our sampler.
+    sampler::MALA,
+    # Our sampler state.
+    state::MALAState;
+    kwargs...
+)
+```
+
+Great! Now let's actually implement the full `AbstractMCMC.step` for our `MALA`.
+
+Let's remind ourselves what we're going to do:
+
+1. Sample a new momentum $p$.
+2. Compute the log-density of the extended target $\log \bar{\gamma}(x, p)$.
+3. Take a single leapfrog step $(\tilde{x}, \tilde{p}) = L_{\epsilon}(x, p)$.
+4. Accept or reject the proposed $(\tilde{x}, \tilde{p})$.
+
+All in all, this results in the following:
+
+```{julia}
+using Random: Random
+using Distributions # so we get the `MvNormal`
+
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG,
+    model_wrapper::AbstractMCMC.LogDensityModel,
+    sampler::MALA,
+    state::MALAState;
+    kwargs...
+)
+    # Extract the wrapped model which implements LogDensityProblems.jl.
+    model = model_wrapper.logdensity
+    # Let's just extract the sampler parameters to make our lives easier.
+    ϵ = sampler.ϵ_init
+    M = sampler.M_init
+    # Extract the current parameters.
+    x = state.x
+    # Sample the momentum.
+    p_dist = MvNormal(zeros(LogDensityProblems.dimension(model)), M)
+    p = rand(rng, p_dist)
+    # Propose using a single leapfrog step.
+    x̃, p̃ = leapfrog_step(model, x, p, ϵ, M)
+    # Accept or reject proposal.
+    logp = LogDensityProblems.logdensity(model, x) + logpdf(p_dist, p)
+    logp̃ = LogDensityProblems.logdensity(model, x̃) + logpdf(p_dist, p̃)
+    logα = logp̃ - logp
+    state_new = if log(rand(rng)) < logα
+        # Accept.
+        MALAState(x̃)
+    else
+        # Reject.
+        MALAState(x)
+    end
+    # Return the "sample" and the sampler state.
+    return MALASample(state_new.x), state_new
+end
+```
+
+Fairly straightforward.
+
+Of course, we haven't defined the `leapfrog_step` method yet, so let's do that:
+
+```{julia}
+function leapfrog_step(model, x, p, ϵ, M)
+    # Update momentum `p` using "position" `x`.
+    ∇logγ_x = last(LogDensityProblems.logdensity_and_gradient(model, x))
+    p1 = p + (ϵ / 2) .* ∇logγ_x
+    # Update the "position" `x` using momentum `p1`.
+    x̃ = x + ϵ .* (M \ p1)
+    # Update momentum `p1` using position `x̃`.
+    ∇logγ_x̃ = last(LogDensityProblems.logdensity_and_gradient(model, x̃))
+    p2 = p1 + (ϵ / 2) .* ∇logγ_x̃
+    # Flip momentum `p2`.
+    p̃ = -p2
+    return x̃, p̃
+end
+```
+
+With all of this, we're technically ready to sample!
+
+```{julia}
+using Random, LinearAlgebra
+
+rng = Random.default_rng()
+sampler = MALA(1, I)
+state = MALAState(zeros(LogDensityProblems.dimension(model)))
+
+x_next, state_next = AbstractMCMC.step(
+    rng,
+    AbstractMCMC.LogDensityModel(model),
+    sampler,
+    state
+)
+```
+
+Great, it works!
+
+And I promised we would get quite a lot of functionality for free if we implemented `AbstractMCMC.step`, and so we can now simply call `sample` to perform standard MCMC sampling:
+
+```{julia}
+# Perform 10 000 iterations with our `MALA` sampler.
+samples = sample(model_with_grad, sampler, 10_000; initial_state=state, progress=false)
+# Concatenate into a matrix.
+samples_matrix = stack(sample -> sample.x, samples)
+```
+
+```{julia}
+# Compute the marginal means and standard deviations.
+hcat(mean(samples_matrix; dims=2), std(samples_matrix; dims=2))
+```
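+
+For this model we know the truth: the marginal means should be close to `[-5, 0, 5]` and the marginal standard deviations should all be close to 1, since the target is an isotropic Gaussian with identity covariance. A quick (and somewhat crude, given Monte Carlo error) sanity check:
+
+```{julia}
+# Crude check that the estimated means are near the true means; the tolerance is
+# deliberately loose to account for Monte Carlo error.
+isapprox(vec(mean(samples_matrix; dims=2)), [-5.0, 0.0, 5.0]; atol=0.5)
+```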
+
+Let's visualize the samples:
+
+```{julia}
+using StatsPlots
+plot(transpose(samples_matrix[:, 1:10:end]), alpha=0.5, legend=false)
+```
+
+Look at that! Things are working; amazin'.
+
+We can also exploit [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl)'s parallel sampling capabilities:
+
+```{julia}
+# Run 4 separate chains for 10 000 iterations, using threads to parallelize.
+num_chains = 4
+samples = sample(
+    model_with_grad,
+    sampler,
+    MCMCThreads(),
+    10_000,
+    num_chains;
+    # Note we need to provide an initial state for every chain.
+    initial_state=fill(state, num_chains),
+    progress=false
+)
+samples_array = stack(map(Base.Fix1(stack, sample -> sample.x), samples))
+```
+
+But the fact that we have to provide `AbstractMCMC.sample` and friends with an `initial_state` to get started is a bit annoying. We can avoid this by also defining an `AbstractMCMC.step` *without* the `state` argument:
+
+```{julia}
+function AbstractMCMC.step(
+    rng::Random.AbstractRNG,
+    model_wrapper::AbstractMCMC.LogDensityModel,
+    ::MALA;
+    # NOTE: No state provided!
+    kwargs...
+)
+    model = model_wrapper.logdensity
+    # Let's just create the initial state by sampling from a standard Gaussian.
+    x = randn(rng, LogDensityProblems.dimension(model))
+
+    return MALASample(x), MALAState(x)
+end
+```
+
+Equipped with this, we no longer need to provide the `initial_state` everywhere:
+
+```{julia}
+samples = sample(model_with_grad, sampler, 10_000; progress=false)
+samples_matrix = stack(sample -> sample.x, samples)
+hcat(mean(samples_matrix; dims=2), std(samples_matrix; dims=2))
+```
+
+## Using our sampler with Turing.jl
+
+As we promised, all of this hassle of implementing our `MALA` sampler in a way that uses [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) and [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) gets us something more than *just* an "automatic" implementation of `AbstractMCMC.sample`.
+
+It also enables use with Turing.jl through `externalsampler`, but we need to do one final thing first: we need to tell Turing.jl how to extract a vector of parameters from the "sample" returned in our implementation of `AbstractMCMC.step`. In our case, the "sample" is a `MALASample`, so we just need the following line:
+
+```{julia}
+# Load Turing.jl.
+using Turing
+
+# Overload the `getparams` method for our "sample" type, which is just a vector.
+Turing.Inference.getparams(::Turing.Model, sample::MALASample) = sample.x
+```
+
+And with that, we're good to go!
+
+```{julia}
+# Our previous model defined as a Turing.jl model.
+@model mvnormal_model() = x ~ MvNormal([-5., 0., 5.], I)
+# Instantiate our model.
+turing_model = mvnormal_model()
+# Call `sample` but now we're passing in a Turing.jl `model` and wrapping
+# our `MALA` sampler in the `externalsampler` to tell Turing.jl that the sampler
+# expects something that implements LogDensityProblems.jl.
+chain = sample(turing_model, externalsampler(sampler), 10_000; progress=false)
+```
+
+Pretty neat, eh?
+
+### Models with constrained parameters
+
+One thing we've sort of glossed over in all of the above is that MALA, at least as we've implemented it, requires $x$ to live in $\mathbb{R}^d$ for some $d > 0$. If some of the parameters were in fact constrained, e.g. we were working with a `Beta` distribution which has support on the interval $(0, 1)$, *not* on $\mathbb{R}^d$, we could easily end up outside of the valid range $(0, 1)$.
+
+```{julia}
+@model beta_model() = x ~ Beta(3, 3)
+turing_model = beta_model()
+chain = sample(turing_model, externalsampler(sampler), 10_000; progress=false)
+```
+
+Yep, that still works, but only because Turing.jl actually *transforms* the `turing_model` from constrained to unconstrained, so that the `sampler` provided to `externalsampler` is actually always working in unconstrained space!
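+
+Under the hood, Turing.jl constructs this transformation using [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl): sampling happens on $\mathbb{R}$, and every log-density evaluation maps back to the constrained space with the appropriate Jacobian correction. A rough sketch of the idea (simplified, and not exactly what Turing.jl does internally):
+
+```{julia}
+#| eval: false
+using Bijectors: Bijectors
+
+dist = Beta(3, 3)
+b = Bijectors.bijector(dist)    # maps the support (0, 1) to ℝ
+binv = Bijectors.inverse(b)     # maps ℝ back to (0, 1)
+
+# Log-density of the transformed variable at an unconstrained point `y`,
+# including the log-abs-det-Jacobian term of the inverse map.
+logdensity_unconstrained(y) = logpdf(dist, binv(y)) + Bijectors.logabsdetjac(binv, y)
+```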
+
+This automatic transformation is not always desirable, so we can turn it off:
+
+```{julia}
+chain = sample(turing_model, externalsampler(sampler; unconstrained=false), 10_000; progress=false)
+```
+
+The fun thing is that this still sort of works, because
+
+```{julia}
+logpdf(Beta(3, 3), 10.0)
+```
+
+and so proposals that fall outside of the valid range are always rejected. But do notice how much worse all the diagnostics are; e.g. `ess_tail` is very poor compared to when we use `unconstrained=true`. Moreover, in more complex cases this won't just result in a "nice" `-Inf` log-density value, but will error instead:
+
+```{julia}
+#| error: true
+@model function demo()
+    σ² ~ truncated(Normal(), lower=0)
+    # If we end up with negative values for `σ²`, the `Normal` will error.
+    x ~ Normal(0, σ²)
+end
+sample(demo(), externalsampler(sampler; unconstrained=false), 10_000; progress=false)
+```
+
+As expected, we run into a `DomainError` at some point. If we instead set `unconstrained=true`, letting Turing.jl transform the model to an unconstrained form behind the scenes, everything works as expected:
+
+```{julia}
+sample(demo(), externalsampler(sampler; unconstrained=true), 10_000; progress=false)
+```
+
+Neat!
+
+Similarly, the automatic differentiation backend to use can be specified through the `adtype` keyword argument. For example, if we want to use [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) instead of the default [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl):
+
+```{julia}
+using ReverseDiff: ReverseDiff
+# Specify that we want to use `AutoReverseDiff`.
+sample(
+    demo(),
+    externalsampler(sampler; unconstrained=true, adtype=AutoReverseDiff()),
+    10_000;
+    progress=false
+)
+```
+
+Double-neat.
+
+## Summary
+
+At this point it's worth reminding ourselves of what we did and also *why* we did it:
+
+1. We define our models in the [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface because it makes the sampler agnostic to how the underlying model is implemented.
+2. We implement our sampler in the [AbstractMCMC.jl](https://github.com/TuringLang/AbstractMCMC.jl) interface, which just means that our sampler is a subtype of `AbstractMCMC.AbstractSampler` and we implement the MCMC transition in `AbstractMCMC.step`.
+3. Points 1 and 2 make it so that our sampler can be used with a wide range of model implementations, among them models implemented in both Turing.jl and Stan. This gives you, the inference implementer, a large collection of models to test your inference method on, in addition to allowing users of Turing.jl and Stan to try out your inference method with minimal effort.
+
+[^1]: There is no such thing as a proper interface in Julia (at least not officially), and so we use the word "interface" here to mean a few minimal methods that need to be implemented by any type that we treat as a target model.
+
+[^2]: We're going with the leapfrog formulation because in a future version of this tutorial we'll add a section extending this simple "baseline" MALA sampler to more complex versions. See [issue #479](https://github.com/TuringLang/docs/issues/479) for progress on this.