Skip to content

Commit 2c73de3

Browse files
paldaynalimilan
andauthored
VIF and GVIF (#300)
* VIF and GVIF * manual entry * Julia 1.6 compat * remove dead code * VERSION check * tests * tolerance * let documenter find the StatsAPI docstring on its own * efficiency ftw * piracy check * fix _find_intercept with nesting * Aqua 0.7 * add R run for reference values as comment --------- Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
1 parent 1632bff commit 2c73de3

File tree

7 files changed

+247
-10
lines changed

7 files changed

+247
-10
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@ StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
1616
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
1717

1818
[compat]
19-
Aqua = "0.6"
19+
Aqua = "0.7"
2020
CategoricalArrays = "0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10"
2121
DataAPI = "1.1"
2222
DataFrames = "1"
2323
DataStructures = "0.17, 0.18"
2424
ShiftedArrays = "1, 2"
25-
StatsAPI = "1"
25+
StatsAPI = "1.7"
2626
StatsBase = "0.33.5, 0.34"
2727
StatsFuns = "0.9, 1.0"
2828
Tables = "0.2, 1"

docs/make.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
using Documenter, StatsModels
1+
using Documenter, StatsModels, StatsAPI
22

33
DocMeta.setdocmeta!(StatsModels, :DocTestSetup, :(using StatsModels, StatsBase); recursive=true)
44

@@ -15,7 +15,7 @@ makedocs(
1515
"Temporal variables and Time Series Terms" => "temporal_terms.md",
1616
"API documentation" => "api.md"
1717
],
18-
modules = [StatsModels],
18+
modules = [StatsModels, StatsAPI],
1919
doctestfilters = [r"([a-z]*) => \1", r"getfield\(.*##[0-9]+#[0-9]+"],
2020
strict=Documenter.except(:missing_docs)
2121
)

docs/src/api.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,12 @@ apply_schema
7474

7575
```@docs
7676
fit
77-
response
78-
modelmatrix
77+
gvif
7978
lrtest
8079
formula
80+
modelmatrix
81+
response
82+
vif
8183
```
8284

8385
### Traits

src/StatsModels.jl

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ using DataStructures
99
using DataAPI
1010
using DataAPI: levels
1111
using Printf: @sprintf
12-
using StatsAPI: coefnames, fit, predict, dof
12+
using StatsAPI: coefnames, dof, fit, gvif, predict, vif
1313
using StatsFuns: chisqccdf
1414

1515
using SparseArrays
@@ -70,7 +70,10 @@ export
7070
omitsintercept,
7171
hasresponse,
7272

73-
lrtest
73+
lrtest,
74+
75+
vif,
76+
gvif
7477

7578
const SPECIALS = (:+, :&, :*, :~)
7679

@@ -84,6 +87,7 @@ include("formula.jl")
8487
include("modelframe.jl")
8588
include("statsmodel.jl")
8689
include("lrtest.jl")
90+
include("vif.jl")
8791
include("deprecated.jl")
8892

8993
end # module StatsModels

src/vif.jl

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# putting this behind a function barrier means that the model matrix
2+
# can potentially be freed immediately if it's large and constructed on the fly
3+
function _find_intercept(model::RegressionModel)
4+
modelmat = modelmatrix(model)
5+
cols = eachcol(modelmat)
6+
# collect is necessary for Julia 1.6
7+
# but it's :just: an array of references to views, so shouldn't be too
8+
# expensive
9+
@static if VERSION < v"1.7"
10+
cols = collect(cols)
11+
end
12+
return findfirst(Base.Fix1(all, isone), cols)
13+
end
14+
15+
_find_intercept(form::FormulaTerm) = _find_intercept(form.rhs)
16+
# we need these in case the RHS is a single term
17+
_find_intercept(::AbstractTerm) = nothing
18+
_find_intercept(::InterceptTerm{true}) = 1
19+
_find_intercept(m::MatrixTerm) = _find_intercept(m.terms)
20+
function _find_intercept(t::TupleTerm)
21+
return findfirst(!isnothing _find_intercept, t)
22+
end
23+
24+
# borrowed from Effects.jl
25+
function get_matrix_term(x)
26+
x = collect_matrix_terms(x)
27+
x = x isa MatrixTerm ? x : first(x)
28+
x isa MatrixTerm || throw(ArgumentError("couldn't extract matrix term from $x"))
29+
# this is a work around for some weirdness that happens in MixedModels.jl
30+
x = first(x.terms) isa MatrixTerm ? only(x.terms) : x
31+
return x
32+
end
33+
34+
function StatsAPI.vif(model::RegressionModel)
35+
vc = vcov(model)
36+
Base.require_one_based_indexing(vc)
37+
38+
intercept = _find_intercept(model)
39+
isnothing(intercept) &&
40+
throw(ArgumentError("VIF is only defined for models with an intercept term"))
41+
42+
# copy just in case vc was a reference to an internal structure
43+
vc = StatsBase.cov2cor!(copy(vc), stderror(model))
44+
m = view(vc, axes(vc, 1) .!= intercept, axes(vc, 2) .!= intercept)
45+
size(m, 2) > 1 ||
46+
throw(ArgumentError("VIF not meaningful for models with only one non-intercept term"))
47+
# NB: The correlation matrix is positive definite and hence invertible
48+
# unless there is perfect rank deficiency, hence the warning in the docstring.
49+
return diag(inv(Symmetric(m)))
50+
end
51+
52+
"""
53+
gvif(model::RegressionModel; scale=false)
54+
55+
Compute the generalized variance inflation factor (GVIF).
56+
57+
If `scale=true`, then each GVIF is scaled by the degrees of freedom
58+
for (number of coefficients associated with) the predictor: ``GVIF^(1 / (2*df))``.
59+
60+
Returns a vector of inflation factors computed for each term except
61+
for the intercept.
62+
In other words, the corresponding coefficients are `termnames(m)[2:end]`.
63+
64+
The [GVIF](https://doi.org/10.2307/2290467)
65+
measures the increase in the variance of a (group of) parameter's estimate in a model
66+
with multiple parameters relative to the variance of a parameter's estimate in a
67+
model containing only that parameter. For continuous, numerical predictors, the GVIF
68+
is the same as the VIF, but for categorical predictors, the GVIF provides a single
69+
number for the entire group of contrast-coded coefficients associated with a categorical
70+
predictor.
71+
72+
See also [`termnames`](@ref), [`vif`](@ref).
73+
74+
!!! warning
75+
This method will fail if there is (numerically) perfect multicollinearity,
76+
i.e. rank deficiency (in the fixed effects). In that case though, the VIF
77+
isn't particularly informative anyway.
78+
79+
# References
80+
81+
Fox, J., & Monette, G. (1992). Generalized Collinearity Diagnostics.
82+
Journal of the American Statistical Association, 87(417), 178. doi:10.2307/2290467
83+
"""
84+
function StatsAPI.gvif(model::RegressionModel; scale=false)
85+
form = formula(model)
86+
intercept = _find_intercept(form)
87+
isnothing(intercept) &&
88+
throw(ArgumentError("GVIF only defined for models with an intercept term"))
89+
vc = vcov(model)
90+
Base.require_one_based_indexing(vc)
91+
92+
vc = StatsBase.cov2cor!(copy(vc), stderror(model))
93+
m = view(vc, axes(vc, 1) .!= intercept, axes(vc, 2) .!= intercept)
94+
size(m, 2) > 1 ||
95+
throw(ArgumentError("GVIF not meaningful for models with only one non-intercept term"))
96+
97+
tn = last(termnames(model))
98+
tn = view(tn, axes(tn, 1) .!= intercept)
99+
trms = get_matrix_term(form.rhs).terms
100+
# MatrixTerms.terms is a tuple or vector so always 1-based indexing
101+
trms = trms[1:length(trms) .!= intercept]
102+
103+
df = width.(trms)
104+
vals = zeros(eltype(m), length(tn))
105+
logdetm = logdet(m)
106+
acc = 0
107+
for idx in axes(vals, 1)
108+
wt = df[idx]
109+
trm_only = [acc < i <= (acc + wt) for i in axes(m, 2)]
110+
trm_excl = .!trm_only
111+
vals[idx] = exp(logdet(view(m, trm_only, trm_only)) +
112+
logdet(view(m, trm_excl, trm_excl)) -
113+
logdetm)
114+
acc += wt
115+
end
116+
117+
if scale
118+
vals .= vals .^ (1 ./ (2 .* df))
119+
end
120+
return vals
121+
end

test/runtests.jl

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,15 @@ my_tests = ["ambiguity.jl",
2323
"contrasts.jl",
2424
"extension.jl",
2525
"traits.jl",
26-
"protect.jl"]
26+
"protect.jl",
27+
"vif.jl"]
2728

2829
@testset "StatsModels" begin
2930
@testset "aqua" begin
30-
Aqua.test_all(StatsModels; ambiguities=false)
31+
# because VIF and GVIF are defined in StatsAPI for RegressionModel,
32+
# which is also defined there, it's flagged as piracy. But
33+
# we're the offical implementers so it's privateering.
34+
Aqua.test_all(StatsModels; ambiguities=false, piracy=(treat_as_own=[vif, gvif],))
3135
end
3236

3337
for tf in my_tests

test/vif.jl

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
using StatsAPI: RegressionModel
2+
3+
struct MyRegressionModel <: RegressionModel
4+
end
5+
6+
StatsAPI.modelmatrix(::MyRegressionModel) = [1 2; 3 4]
7+
StatsAPI.vcov(::MyRegressionModel) = [1 0; 0 1]
8+
9+
struct MyRegressionModel2 <: RegressionModel
10+
end
11+
12+
StatsAPI.modelmatrix(::MyRegressionModel2) = [1 2; 1 2]
13+
StatsAPI.vcov(::MyRegressionModel2) = [1 0; 0 1]
14+
15+
struct MyRegressionModel3 <: RegressionModel
16+
end
17+
18+
StatsAPI.modelmatrix(::MyRegressionModel3) = [1 2 3; 1 2 3]
19+
StatsAPI.vcov(::MyRegressionModel3) = [1 0 0; 0 1 0; 0 0 1]
20+
21+
Base.@kwdef struct Duncan <: RegressionModel
22+
coefnames::Vector{String}
23+
coef::Vector{Float64}
24+
stderror::Vector{Float64}
25+
modelmatrix::Matrix{Float64}
26+
vcov::Matrix{Float64}
27+
formula::FormulaTerm
28+
end
29+
30+
for f in [:coefnames, :coef, :stderror, :modelmatrix, :vcov]
31+
@eval StatsAPI.$f(model::Duncan) = model.$f
32+
end
33+
34+
StatsModels.formula(model::Duncan) = model.formula
35+
36+
@testset "VIF input checks" begin
37+
# no intercept term
38+
@test_throws ArgumentError vif(MyRegressionModel())
39+
40+
# only one non intercept term
41+
@test_throws ArgumentError vif(MyRegressionModel2())
42+
43+
# vcov is identity, so the VIF is just 1
44+
@test vif(MyRegressionModel3()) [1, 1]
45+
end
46+
47+
# Reference values from car::vif in R:
48+
# > library(car)
49+
# > data(Duncan)
50+
# > lm1 = lm(prestige ~ 1 + income + education, Duncan)
51+
# > vif(lm1)
52+
# income education
53+
# 2.1049 2.1049
54+
# > lm2 = lm(prestige ~ 1 + income + education + type, Duncan)
55+
# > vif(lm2)
56+
# GVIF Df GVIF^(1/(2*Df))
57+
# income 2.209178 1 1.486330
58+
# education 5.297584 1 2.301648
59+
# type 5.098592 2 1.502666
60+
61+
62+
@testset "GVIF and VIF are the same for continuous predictors" begin
63+
# these are copied from a GLM fit to the car::duncan data
64+
duncan2 = Duncan(; coefnames=["(Intercept)", "Income", "Education"],
65+
formula=term(:Prestige) ~ InterceptTerm{true}() + ContinuousTerm(:Income, 1,1,1,1) +
66+
ContinuousTerm(:Education, 1,1,1,1),
67+
coef=[-6.064662922103356, 0.5987328215294941, 0.5458339094008812],
68+
stderror=[4.271941174529124, 0.11966734981235407, 0.0982526413303999],
69+
# we actually don't need the whole matrix -- just enough to find an intercept
70+
modelmatrix=[1.0 62.0 86.0],
71+
vcov=[18.2495 -0.151845 -0.150706
72+
-0.151845 0.0143203 -0.00851855
73+
-0.150706 -0.00851855 0.00965358])
74+
@test vif(duncan2) [2.1049, 2.1049] atol=1e-5
75+
# two different ways of calculating the same quantity
76+
@test vif(duncan2) gvif(duncan2)
77+
end
78+
79+
@testset "GVIF" begin
80+
cm = StatsModels.ContrastsMatrix(DummyCoding("bc", ["bc", "prof", "wc"]), ["bc", "prof", "wc"])
81+
duncan3 = Duncan(; coefnames=["(Intercept)", "Income", "Education", "Type: prof", "Type: wc"],
82+
formula=term(:Prestige) ~ InterceptTerm{true}() + ContinuousTerm(:Income, 1,1,1,1) +
83+
ContinuousTerm(:Education, 1,1,1,1) + CategoricalTerm(:Type, cm),
84+
coef=[0.185028, 0.597546, 0.345319, 16.6575, -14.6611],
85+
stderror=[3.71377, 0.0893553, 0.113609, 6.99301, 6.10877],
86+
# we actually don't need the whole matrix -- just enough to find an intercept
87+
modelmatrix=[1.0 62.0 86.0 1.0 0.0],
88+
vcov=[13.7921 -0.115637 -0.257486 14.0947 7.9022
89+
-0.115637 0.00798437 -0.00292449 -0.126011 -0.109049
90+
-0.257486 -0.00292449 0.012907 -0.616651 -0.38812
91+
14.0947 -0.126011 -0.616651 48.9021 30.2139
92+
7.9022 -0.109049 -0.38812 30.2139 37.3171])
93+
@test gvif(duncan3) [2.209178, 5.297584, 5.098592] atol=1e-4
94+
@test gvif(duncan3; scale=true) [1.486330, 2.301648, 1.502666] atol=1e-5
95+
end
96+
97+
@testset "utils" begin
98+
int = InterceptTerm{true}()
99+
noint = InterceptTerm{false}()
100+
xterm = term(:x)
101+
@test StatsModels._find_intercept(xterm) === nothing
102+
@test StatsModels._find_intercept(int) == 1
103+
@test StatsModels._find_intercept(noint) === nothing
104+
@test StatsModels._find_intercept(MatrixTerm((xterm, int))) == 2
105+
@test StatsModels.get_matrix_term(MatrixTerm(MatrixTerm((xterm, int)))) == MatrixTerm((xterm, int))
106+
end

0 commit comments

Comments
 (0)