Skip to content

Commit b58f286

Browse files
authored
Univariatize even more. (#86)
* Univariatize even more.
1 parent 27dd9b6 commit b58f286

File tree

10 files changed

+311
-260
lines changed

10 files changed

+311
-260
lines changed

src/LineSearches.jl

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ module LineSearches
55
using Parameters, NaNMath
66

77
import NLSolversBase
8+
import NLSolversBase: AbstractObjective
89
import Base.clear!
910

1011
export LineSearchResults, LineSearchException
@@ -26,6 +27,19 @@ function make_ϕ(df, x_new, x, s)
2627
end
2728
ϕ
2829
end
30+
function make_ϕdϕ(df, x_new, x, s)
31+
function ϕdϕ(α)
32+
# Move a distance of alpha in the direction of s
33+
x_new .= x .+ α.*s
34+
35+
# Evaluate ∇f(x+α*s)
36+
NLSolversBase.value_gradient!(df, x_new)
37+
38+
# Calculate ϕ(a_i), ϕ'(a_i)
39+
NLSolversBase.value(df), vecdot(NLSolversBase.gradient(df), s)
40+
end
41+
ϕdϕ
42+
end
2943
function make_ϕ_dϕ(df, x_new, x, s)
3044
function (α)
3145
# Move a distance of alpha in the direction of s
@@ -62,6 +76,19 @@ function make_ϕ_dϕ_ϕdϕ(df, x_new, x, s)
6276
end
6377
make_ϕ(df, x_new, x, s), dϕ, ϕdϕ
6478
end
79+
function make_ϕ_ϕdϕ(df, x_new, x, s)
80+
function ϕdϕ(α)
81+
# Move a distance of alpha in the direction of s
82+
x_new .= x .+ α.*s
83+
84+
# Evaluate ∇f(x+α*s)
85+
NLSolversBase.value_gradient!(df, x_new)
86+
87+
# Calculate ϕ(a_i), ϕ'(a_i)
88+
NLSolversBase.value(df), vecdot(NLSolversBase.gradient(df), s)
89+
end
90+
make_ϕ(df, x_new, x, s), ϕdϕ
91+
end
6592

6693
include("types.jl")
6794

src/backtracking.jl

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,23 @@ This is a modification of the algorithm described in Nocedal Wright (2nd ed), Se
1717
maxstep::TF = Inf
1818
end
1919

20-
function (ls::BackTracking)(df, x::AbstractArray{T}, s::AbstractArray{T},
21-
x_new::AbstractArray{T},
22-
ϕ_0, dϕ_0, α_0::Tα = 1.0, alphamax = convert(T, Inf)) where {T, Tα}
20+
function (ls::BackTracking)(df::AbstractObjective, x::AbstractArray{T}, s::AbstractArray{T},
21+
α_0::Tα = 1.0, x_new::AbstractArray{T} = similar(x), ϕ_0 = nothing, dϕ_0 = nothing, alphamax = convert(T, Inf)) where {T, Tα}
22+
ϕ, dϕ = make_ϕ_dϕ(df, x_new, x, s)
2323

24-
@unpack c_1, ρ_hi, ρ_lo, iterations, order, maxstep = ls
24+
if ϕ_0 == nothing
25+
ϕ_0 = ϕ(α_0)
26+
end
27+
if dϕ_0 == nothing
28+
dϕ_0 = ϕ(α_0)
29+
end
2530

26-
ϕ = make_ϕ(df, x_new, x, s)
31+
ls(ϕ, x, s, α_0, ϕ_0, dϕ_0, alphamax)
32+
end
33+
function (ls::BackTracking)(ϕ, x::AbstractArray{T}, s::AbstractArray{T}, α_0::Tα,
34+
ϕ_0, dϕ_0, alphamax = convert(T, Inf)) where {T, Tα}
35+
36+
@unpack c_1, ρ_hi, ρ_lo, iterations, order, maxstep = ls
2737

2838
iterfinitemax = -log2(eps(T))
2939

@@ -102,5 +112,5 @@ function (ls::BackTracking)(df, x::AbstractArray{T}, s::AbstractArray{T},
102112
ϕx_0, ϕx_1 = ϕx_1, ϕ(α_2)
103113
end
104114

105-
return α_2
115+
return α_2, ϕx_1
106116
end

src/hagerzhang.jl

Lines changed: 34 additions & 166 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ Conjugate gradient line search implementation from:
8080
conjugate gradient method with guaranteed descent. ACM
8181
Transactions on Mathematical Software 32: 113–137.
8282
"""
83-
@with_kw struct HagerZhang{T}
83+
@with_kw struct HagerZhang{T, Tm}
8484
delta::T = DEFAULTDELTA # c_1 Wolfe sufficient decrease condition
8585
sigma::T = DEFAULTSIGMA # c_2 Wolfe curvature condition (Recommend 0.1 for GradientDescent)
8686
alphamax::T = Inf
@@ -90,34 +90,27 @@ Conjugate gradient line search implementation from:
9090
linesearchmax::Int = 50
9191
psi3::T = 0.1
9292
display::Int = 0
93+
mayterminate::Tm = Ref{Bool}(false)
9394
end
9495

95-
(ls::HagerZhang)(args...) = _hagerzhang!(args...,
96-
ls.delta, ls.sigma, ls.alphamax, ls.rho, ls.epsilon, ls.gamma,
97-
ls.linesearchmax, ls.psi3, ls.display)
98-
96+
function (ls::HagerZhang)(df::AbstractObjective, x::AbstractArray{T},
97+
s::AbstractArray{T}, α::Real,
98+
x_new::AbstractArray{T}, phi_0::Real, dphi_0::Real) where T
99+
ϕ, ϕdϕ = make_ϕ_ϕdϕ(df, x_new, x, s)
100+
ls(ϕ, ϕdϕ, x, s, α::Real, phi_0, dphi_0)
101+
end
99102

100-
function _hagerzhang!(df,
103+
function (ls::HagerZhang)(ϕ, ϕdϕ,
101104
x::AbstractArray{T},
102105
s::AbstractArray{T},
103-
x_new::AbstractArray{T},
104-
phi_0,
105-
dphi_0,
106106
c::Real,
107-
mayterminate::Bool,
108-
delta::Real = DEFAULTDELTA,
109-
sigma::Real = DEFAULTSIGMA,
110-
alphamax::Real = convert(T,Inf),
111-
rho::Real = convert(T,5),
112-
epsilon::Real = convert(T,1e-6),
113-
gamma::Real = convert(T,0.66),
114-
linesearchmax::Integer = 50,
115-
psi3::Real = convert(T,0.1),
116-
display::Integer = 0) where T
117-
118-
ϕ, dϕ, ϕdϕ = make_ϕ_dϕ_ϕdϕ(df, x_new, x, s)
119-
120-
# Prevent values of `x_new` that are likely to make
107+
phi_0::Real,
108+
dphi_0::Real) where T
109+
110+
@unpack delta, sigma, alphamax, rho, epsilon, gamma,
111+
linesearchmax, psi3, display, mayterminate = ls
112+
113+
# Prevent values of x_new = x+αs that are likely to make
121114
# ϕ(x_new) infinite
122115
iterfinitemax::Int = ceil(Int, -log2(eps(T)))
123116
alphas = [T(0.0)] # for bisection
@@ -134,27 +127,29 @@ function _hagerzhang!(df,
134127
phi_c, dphi_c = ϕdϕ(c)
135128
iterfinite = 1
136129
while !(isfinite(phi_c) && isfinite(dphi_c)) && iterfinite < iterfinitemax
137-
mayterminate = false
130+
mayterminate[] = false
138131
iterfinite += 1
139132
c *= psi3
140133
phi_c, dphi_c = ϕdϕ(c)
141134
end
142135
if !(isfinite(phi_c) && isfinite(dphi_c))
143136
warn("Failed to achieve finite new evaluation point, using alpha=0")
144-
return zero(T) # phi_0
137+
mayterminate[] = false # reset in case another initial guess is used next
138+
return T(0.0), ϕ(T(0.0)) # phi_0
145139
end
146140
push!(alphas, c)
147141
push!(values, phi_c)
148142
push!(slopes, dphi_c)
149143

150144
# If c was generated by quadratic interpolation, check whether it
151145
# satisfies the Wolfe conditions
152-
if mayterminate &&
146+
if mayterminate[] &&
153147
satisfies_wolfe(c, phi_c, dphi_c, phi_0, dphi_0, phi_lim, delta, sigma)
154148
if display & LINESEARCH > 0
155149
println("Wolfe condition satisfied on point alpha = ", c)
156150
end
157-
return c # phi_c
151+
mayterminate[] = false # reset in case another initial guess is used next
152+
return c, phi_c # phi_c
158153
end
159154
# Initial bracketing step (HZ, stages B0-B3)
160155
isbracketed = false
@@ -204,7 +199,8 @@ function _hagerzhang!(df,
204199
", cold = ", cold, ", new c = ", c)
205200
end
206201
if c == cold || nextfloat(c) >= alphamax
207-
return cold
202+
mayterminate[] = false # reset in case another initial guess is used next
203+
return cold, dphi_c
208204
end
209205
end
210206
phi_c, dphi_c = ϕdϕ(c)
@@ -219,7 +215,8 @@ function _hagerzhang!(df,
219215
phi_c, dphi_c = ϕdϕ(c)
220216
end
221217
if !(isfinite(phi_c) && isfinite(dphi_c))
222-
return cold
218+
mayterminate[] = false # reset in case another initial guess is used next
219+
return cold, ϕ(cold)
223220
elseif dphi_c < 0 && c == alphamax
224221
# We're on the edge of the allowed region, and the
225222
# value is still decreasing. This can be due to
@@ -234,7 +231,8 @@ function _hagerzhang!(df,
234231
", phi_c = ", phi_c,
235232
", dphi_c = ", dphi_c)
236233
end
237-
return c
234+
mayterminate[] = false # reset in case another initial guess is used next
235+
return c, phi_c
238236
end
239237
push!(alphas, c)
240238
push!(values, phi_c)
@@ -255,11 +253,13 @@ function _hagerzhang!(df,
255253
", phi(b) = ", values[ib])
256254
end
257255
if b - a <= eps(b)
258-
return a # lsr.value[ia]
256+
mayterminate[] = false # reset in case another initial guess is used next
257+
return a, values[ia] # lsr.value[ia]
259258
end
260259
iswolfe, iA, iB = secant2!(ϕdϕ, alphas, values, slopes, ia, ib, phi_lim, delta, sigma, display)
261260
if iswolfe
262-
return alphas[iA] # lsr.value[iA]
261+
mayterminate[] = false # reset in case another initial guess is used next
262+
return alphas[iA], values[iA] # lsr.value[iA]
263263
end
264264
A = alphas[iA]
265265
B = alphas[iB]
@@ -273,7 +273,8 @@ function _hagerzhang!(df,
273273
if display & LINESEARCH > 0
274274
println("Linesearch: secant suggests it's flat")
275275
end
276-
return A
276+
mayterminate[] = false # reset in case another initial guess is used next
277+
return A, values[iA]
277278
end
278279
ia = iA
279280
ib = iB
@@ -504,136 +505,3 @@ function bisect!(ϕdϕ,
504505
end
505506
return ia, ib
506507
end
507-
508-
"""
509-
Initial step size algorithm from
510-
W. W. Hager and H. Zhang (2006) Algorithm 851: CG_DESCENT, a
511-
conjugate gradient method with guaranteed descent. ACM
512-
Transactions on Mathematical Software 32: 113–137.
513-
514-
If α0 is NaN, then procedure I0 is called at the first iteration,
515-
otherwise, we select according to procedure I1-2, with starting value α0.
516-
"""
517-
@with_kw struct InitialHagerZhang{T}
518-
ψ0::T = 0.01
519-
ψ1::T = 0.2
520-
ψ2::T = 2.0
521-
ψ3::T = 0.1
522-
αmax::T = Inf
523-
α0::T = 1.0 # Initial alpha guess. NaN => algorithm calculates
524-
verbose::Bool = false
525-
end
526-
527-
function (is::InitialHagerZhang)(state, phi_0, dphi_0, df)
528-
529-
530-
if isnan(state.f_x_previous) && isnan(is.α0)
531-
# If we're at the first iteration (f_x_previous is NaN)
532-
# and the user has not provided an initial step size (is.α0 is NaN),
533-
# then we
534-
# pick the initial step size according to HZ #I0
535-
state.alpha = _hzI0(state.x, NLSolversBase.gradient(df),
536-
NLSolversBase.value(df),
537-
convert(eltype(state.x), is.ψ0)) # Hack to deal with type instability between is{T} and state.x
538-
state.mayterminate = false
539-
else
540-
# Pick the initial step size according to HZ #I1-2
541-
state.alpha, state.mayterminate =
542-
_hzI12(state.alpha, df, state.x, state.s, state.x_ls, phi_0, dphi_0,
543-
is.ψ1, is.ψ2, is.ψ3, is.αmax, is.verbose)
544-
end
545-
return state.alpha
546-
end
547-
548-
# Pick the initial step size (HZ #I1-I2)
549-
function _hzI12(alpha::T,
550-
df,
551-
x::AbstractArray{T},
552-
s::AbstractArray{T},
553-
x_new::AbstractArray{T},
554-
phi_0::T,
555-
dphi_0::T,
556-
psi1::Real = convert(T,0.2),
557-
psi2::Real = convert(T,2.0),
558-
psi3::Real = convert(T,0.1),
559-
alphamax::Real = convert(T, Inf),
560-
verbose::Bool = false) where T
561-
562-
563-
ϕ = make_ϕ(df, x_new, x, s)
564-
565-
# Prevent values of `x_new` that are likely to make
566-
# ϕ(x_new) infinite
567-
iterfinitemax::Int = ceil(Int, -log2(eps(T)))
568-
569-
alphatest = psi1 * alpha
570-
alphatest = min(alphatest, alphamax)
571-
572-
phitest = ϕ(alphatest)
573-
574-
iterfinite = 1
575-
while !isfinite(phitest)
576-
alphatest = psi3 * alphatest
577-
578-
phitest = ϕ(alphatest)
579-
580-
iterfinite += 1
581-
if iterfinite >= iterfinitemax
582-
return zero(T), true
583-
# error("Failed to achieve finite test value; alphatest = ", alphatest)
584-
end
585-
end
586-
a = ((phitest-phi_0)/alphatest - dphi_0)/alphatest # quadratic fit
587-
if verbose == true
588-
println("quadfit: alphatest = ", alphatest,
589-
", phi_0 = ", phi_0,
590-
", phitest = ", phitest,
591-
", quadcoef = ", a)
592-
end
593-
mayterminate = false
594-
if isfinite(a) && a > 0 && phitest <= phi_0
595-
alpha = -dphi_0 / 2 / a # if convex, choose minimum of quadratic
596-
if alpha == 0
597-
error("alpha is zero. dphi_0 = ", dphi_0, ", phi_0 = ", phi_0, ", phitest = ", phitest, ", alphatest = ", alphatest, ", a = ", a)
598-
end
599-
if alpha <= alphamax
600-
mayterminate = true
601-
else
602-
alpha = alphamax
603-
mayterminate = false
604-
end
605-
if verbose == true
606-
println("alpha guess (quadratic): ", alpha,
607-
",(mayterminate = ", mayterminate, ")")
608-
end
609-
else
610-
if phitest > phi_0
611-
alpha = alphatest
612-
else
613-
alpha *= psi2 # if not convex, expand the interval
614-
end
615-
end
616-
alpha = min(alphamax, alpha)
617-
if verbose == true
618-
println("alpha guess (expand): ", alpha)
619-
end
620-
return alpha, mayterminate
621-
end
622-
623-
# Generate initial guess for step size (HZ, stage I0)
624-
function _hzI0(x::AbstractArray{T},
625-
gr::AbstractArray{T},
626-
f_x::T,
627-
psi0::T = convert(T,0.01)) where T
628-
alpha = one(T)
629-
gr_max = maximum(abs, gr)
630-
if gr_max != 0.0
631-
x_max = maximum(abs, x)
632-
if x_max != 0.0
633-
alpha = psi0 * x_max / gr_max
634-
elseif f_x != 0.0
635-
alpha = psi0 * abs(f_x) / vecnorm(gr)
636-
end
637-
end
638-
return alpha
639-
end

0 commit comments

Comments
 (0)