VIF and GVIF (#300)

palday · nalimilan · web-flow · commit 2c73de38d515 · 2023-09-13T07:18:26.000Z
* VIF and GVIF

* manual entry

* Julia 1.6 compat

* remove dead code

* VERSION check

* tests

* tolerance

* let documenter find the StatsAPI docstring on its own

* efficiency ftw

* piracy check

* fix _find_intercept with nesting

* Aqua 0.7

* add R run for reference values as comment

---------

Co-authored-by: Milan Bouchet-Valat &lt;nalimilan@club.fr&gt;
diff --git a/Project.toml b/Project.toml
@@ -16,13 +16,13 @@ StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-Aqua = "0.6"
+Aqua = "0.7"
 CategoricalArrays = "0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10"
 DataAPI = "1.1"
 DataFrames = "1"
 DataStructures = "0.17, 0.18"
 ShiftedArrays = "1, 2"
-StatsAPI = "1"
+StatsAPI = "1.7"
 StatsBase = "0.33.5, 0.34"
 StatsFuns = "0.9, 1.0"
 Tables = "0.2, 1"
diff --git a/docs/make.jl b/docs/make.jl
@@ -1,4 +1,4 @@
-using Documenter, StatsModels
+using Documenter, StatsModels, StatsAPI
 
 DocMeta.setdocmeta!(StatsModels, :DocTestSetup, :(using StatsModels, StatsBase); recursive=true)
 
@@ -15,7 +15,7 @@ makedocs(
         "Temporal variables and Time Series Terms" => "temporal_terms.md",
         "API documentation" => "api.md"
     ],
-    modules = [StatsModels],
+    modules = [StatsModels, StatsAPI],
     doctestfilters = [r"([a-z]*) => \1", r"getfield\(.*##[0-9]+#[0-9]+"],
     strict=Documenter.except(:missing_docs)
 )
diff --git a/docs/src/api.md b/docs/src/api.md
@@ -74,10 +74,12 @@ apply_schema
 
 ```@docs
 fit
-response
-modelmatrix
+gvif
 lrtest
 formula
+modelmatrix
+response
+vif
 ```
 
 ### Traits
diff --git a/src/StatsModels.jl b/src/StatsModels.jl
@@ -9,7 +9,7 @@ using DataStructures
 using DataAPI
 using DataAPI: levels
 using Printf: @sprintf
-using StatsAPI: coefnames, fit, predict, dof
+using StatsAPI: coefnames, dof, fit, gvif, predict, vif
 using StatsFuns: chisqccdf
 
 using SparseArrays
@@ -70,7 +70,10 @@ export
     omitsintercept,
     hasresponse,
 
-    lrtest
+    lrtest,
+
+    vif,
+    gvif
 
 const SPECIALS = (:+, :&, :*, :~)
 
@@ -84,6 +87,7 @@ include("formula.jl")
 include("modelframe.jl")
 include("statsmodel.jl")
 include("lrtest.jl")
+include("vif.jl")
 include("deprecated.jl")
 
 end # module StatsModels
diff --git a/src/vif.jl b/src/vif.jl
@@ -0,0 +1,121 @@
+# putting this behind a function barrier means that the model matrix
+# can potentially be freed immediately if it's large and constructed on the fly
+function _find_intercept(model::RegressionModel)
+    modelmat = modelmatrix(model)
+    cols = eachcol(modelmat)
+    # collect is necessary for Julia 1.6
+    # but it's :just: an array of references to views, so shouldn't be too
+    # expensive
+    @static if VERSION < v"1.7"
+        cols = collect(cols)
+    end
+    return findfirst(Base.Fix1(all, isone), cols)
+end
+
+_find_intercept(form::FormulaTerm) = _find_intercept(form.rhs)
+# we need these in case the RHS is a single term
+_find_intercept(::AbstractTerm) = nothing
+_find_intercept(::InterceptTerm{true}) = 1
+_find_intercept(m::MatrixTerm) = _find_intercept(m.terms)
+function _find_intercept(t::TupleTerm) 
+    return findfirst(!isnothing ∘ _find_intercept, t)
+end
+
+# borrowed from Effects.jl
+function get_matrix_term(x)
+    x = collect_matrix_terms(x)
+    x = x isa MatrixTerm ? x : first(x)
+    x isa MatrixTerm || throw(ArgumentError("couldn't extract matrix term from $x"))
+    # this is a work around for some weirdness that happens in MixedModels.jl
+    x = first(x.terms) isa MatrixTerm ? only(x.terms) : x
+    return x
+end
+
+function StatsAPI.vif(model::RegressionModel)
+    vc = vcov(model)
+    Base.require_one_based_indexing(vc)
+
+    intercept = _find_intercept(model)
+    isnothing(intercept) &&
+        throw(ArgumentError("VIF is only defined for models with an intercept term"))
+
+    # copy just in case vc was a reference to an internal structure
+    vc = StatsBase.cov2cor!(copy(vc), stderror(model))
+    m = view(vc, axes(vc, 1) .!= intercept, axes(vc, 2) .!= intercept)
+    size(m, 2) > 1 ||
+        throw(ArgumentError("VIF not meaningful for models with only one non-intercept term"))
+    # NB: The correlation matrix is positive definite and hence invertible
+    #     unless there is perfect rank deficiency, hence the warning in the docstring.
+    return diag(inv(Symmetric(m)))
+end
+
+"""
+    gvif(model::RegressionModel; scale=false)
+
+Compute the generalized variance inflation factor (GVIF).
+
+If `scale=true`, then each GVIF is scaled by the degrees of freedom
+for (number of coefficients associated with) the predictor: ``GVIF^(1 / (2*df))``.
+
+Returns a vector of inflation factors computed for each term except
+for the intercept.
+In other words, the corresponding coefficients are `termnames(m)[2:end]`.
+
+The [GVIF](https://doi.org/10.2307/2290467)
+measures the increase in the variance of a (group of) parameter's estimate in a model
+with multiple parameters relative to the variance of a parameter's estimate in a
+model containing only that parameter. For continuous, numerical predictors, the GVIF
+is the same as the VIF, but for categorical predictors, the GVIF provides a single
+number for the entire group of contrast-coded coefficients associated with a categorical
+predictor.
+
+See also [`termnames`](@ref), [`vif`](@ref).
+
+!!! warning
+    This method will fail if there is (numerically) perfect multicollinearity,
+    i.e. rank deficiency (in the fixed effects). In that case though, the VIF
+    isn't particularly informative anyway.
+
+# References
+
+Fox, J., & Monette, G. (1992). Generalized Collinearity Diagnostics.
+Journal of the American Statistical Association, 87(417), 178. doi:10.2307/2290467
+"""
+function StatsAPI.gvif(model::RegressionModel; scale=false)
+    form = formula(model)
+    intercept = _find_intercept(form)
+    isnothing(intercept) &&
+        throw(ArgumentError("GVIF only defined for models with an intercept term"))
+    vc = vcov(model)
+    Base.require_one_based_indexing(vc)
+
+    vc = StatsBase.cov2cor!(copy(vc), stderror(model))
+    m = view(vc, axes(vc, 1) .!= intercept, axes(vc, 2) .!= intercept)
+    size(m, 2) > 1 ||
+        throw(ArgumentError("GVIF not meaningful for models with only one non-intercept term"))
+
+    tn = last(termnames(model))
+    tn = view(tn, axes(tn, 1) .!= intercept)
+    trms = get_matrix_term(form.rhs).terms
+    # MatrixTerms.terms is a tuple or vector so always 1-based indexing
+    trms = trms[1:length(trms) .!= intercept]
+
+    df = width.(trms)
+    vals = zeros(eltype(m), length(tn))
+    logdetm = logdet(m)
+    acc = 0
+    for idx in axes(vals, 1)
+        wt = df[idx]
+        trm_only = [acc < i <= (acc + wt) for i in axes(m, 2)]
+        trm_excl = .!trm_only
+        vals[idx] = exp(logdet(view(m, trm_only, trm_only)) +
+                        logdet(view(m, trm_excl, trm_excl)) -
+                        logdetm)
+        acc += wt
+    end
+
+    if scale
+        vals .= vals .^ (1 ./ (2 .* df))
+    end
+    return vals
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -23,11 +23,15 @@ my_tests = ["ambiguity.jl",
             "contrasts.jl",
             "extension.jl",
             "traits.jl",
-            "protect.jl"]
+            "protect.jl",
+            "vif.jl"]
 
 @testset "StatsModels" begin
     @testset "aqua" begin
-        Aqua.test_all(StatsModels; ambiguities=false)
+        # because VIF and GVIF are defined in StatsAPI for RegressionModel,
+        # which is also defined there, it's flagged as piracy. But
+        # we're the offical implementers so it's privateering.
+        Aqua.test_all(StatsModels; ambiguities=false, piracy=(treat_as_own=[vif, gvif],))
     end
 
     for tf in my_tests
diff --git a/test/vif.jl b/test/vif.jl
@@ -0,0 +1,106 @@
+using StatsAPI: RegressionModel
+
+struct MyRegressionModel <: RegressionModel
+end
+
+StatsAPI.modelmatrix(::MyRegressionModel) = [1 2; 3 4]
+StatsAPI.vcov(::MyRegressionModel) = [1 0; 0 1]
+
+struct MyRegressionModel2 <: RegressionModel
+end
+
+StatsAPI.modelmatrix(::MyRegressionModel2) = [1 2; 1 2]
+StatsAPI.vcov(::MyRegressionModel2) = [1 0; 0 1]
+
+struct MyRegressionModel3 <: RegressionModel
+end
+
+StatsAPI.modelmatrix(::MyRegressionModel3) = [1 2 3; 1 2 3]
+StatsAPI.vcov(::MyRegressionModel3) = [1 0 0; 0 1 0; 0 0 1]
+
+Base.@kwdef struct Duncan <: RegressionModel
+    coefnames::Vector{String}
+    coef::Vector{Float64}
+    stderror::Vector{Float64}
+    modelmatrix::Matrix{Float64}
+    vcov::Matrix{Float64}
+    formula::FormulaTerm
+end
+
+for f in [:coefnames, :coef, :stderror, :modelmatrix, :vcov]
+    @eval StatsAPI.$f(model::Duncan) = model.$f
+end 
+
+StatsModels.formula(model::Duncan) = model.formula
+
+@testset "VIF input checks" begin
+    # no intercept term
+    @test_throws ArgumentError vif(MyRegressionModel())
+
+    # only one non intercept term
+    @test_throws ArgumentError vif(MyRegressionModel2())
+
+    # vcov is identity, so the VIF is just 1
+    @test vif(MyRegressionModel3()) ≈ [1, 1]
+end
+
+# Reference values from car::vif in R:
+# > library(car)
+# > data(Duncan)
+# > lm1 = lm(prestige ~ 1 + income + education, Duncan)
+# > vif(lm1)
+#    income education 
+#    2.1049    2.1049 
+# > lm2 = lm(prestige ~ 1 + income + education + type, Duncan)
+# > vif(lm2)
+#               GVIF Df GVIF^(1/(2*Df))
+# income    2.209178  1        1.486330
+# education 5.297584  1        2.301648
+# type      5.098592  2        1.502666
+
+
+@testset "GVIF and VIF are the same for continuous predictors" begin
+    # these are copied from a GLM fit to the car::duncan data 
+    duncan2 = Duncan(; coefnames=["(Intercept)", "Income", "Education"],
+                     formula=term(:Prestige) ~  InterceptTerm{true}() + ContinuousTerm(:Income, 1,1,1,1) + 
+                                                ContinuousTerm(:Education, 1,1,1,1),
+                     coef=[-6.064662922103356, 0.5987328215294941, 0.5458339094008812], 
+                     stderror=[4.271941174529124, 0.11966734981235407, 0.0982526413303999],
+                     # we actually don't need the whole matrix -- just enough to find an intercept
+                     modelmatrix=[1.0 62.0 86.0], 
+                     vcov=[18.2495    -0.151845    -0.150706
+                           -0.151845   0.0143203   -0.00851855
+                           -0.150706  -0.00851855   0.00965358])
+    @test vif(duncan2) ≈ [2.1049, 2.1049] atol=1e-5
+    # two different ways of calculating the same quantity
+    @test vif(duncan2) ≈ gvif(duncan2) 
+end
+
+@testset "GVIF" begin
+    cm = StatsModels.ContrastsMatrix(DummyCoding("bc", ["bc", "prof", "wc"]), ["bc", "prof", "wc"])
+    duncan3 = Duncan(; coefnames=["(Intercept)", "Income", "Education",  "Type: prof", "Type: wc"],
+                     formula=term(:Prestige) ~  InterceptTerm{true}() + ContinuousTerm(:Income, 1,1,1,1) + 
+                                                ContinuousTerm(:Education, 1,1,1,1) + CategoricalTerm(:Type, cm),
+                     coef=[0.185028, 0.597546, 0.345319, 16.6575, -14.6611], 
+                     stderror=[3.71377, 0.0893553, 0.113609, 6.99301, 6.10877],
+                     # we actually don't need the whole matrix -- just enough to find an intercept
+                     modelmatrix=[1.0 62.0 86.0 1.0 0.0], 
+                     vcov=[13.7921    -0.115637    -0.257486    14.0947     7.9022
+                           -0.115637   0.00798437  -0.00292449  -0.126011  -0.109049
+                           -0.257486  -0.00292449   0.012907    -0.616651  -0.38812
+                           14.0947    -0.126011    -0.616651    48.9021    30.2139
+                            7.9022    -0.109049    -0.38812     30.2139    37.3171])
+    @test gvif(duncan3) ≈ [2.209178, 5.297584, 5.098592] atol=1e-4
+    @test gvif(duncan3; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-5
+end
+
+@testset "utils" begin
+   int = InterceptTerm{true}()
+   noint = InterceptTerm{false}()
+   xterm = term(:x)
+   @test StatsModels._find_intercept(xterm) === nothing
+   @test StatsModels._find_intercept(int) == 1
+   @test StatsModels._find_intercept(noint) === nothing
+   @test StatsModels._find_intercept(MatrixTerm((xterm, int))) == 2
+   @test StatsModels.get_matrix_term(MatrixTerm(MatrixTerm((xterm, int)))) == MatrixTerm((xterm, int))
+end