diff --git a/docs/src/colordifferences.md b/docs/src/colordifferences.md
index 5bcec487..a7edb765 100644
--- a/docs/src/colordifferences.md
+++ b/docs/src/colordifferences.md
@@ -10,7 +10,7 @@ julia> colordiff(colorant"red", colorant"blue")
 52.88136782250768
 
 julia> colordiff(HSV(0, 0.75, 0.5), HSL(0, 0.75, 0.5))
-19.48591066257135
+19.48591066257134
 ```
 
 ```julia
diff --git a/docs/src/colormapsandcolorscales.md b/docs/src/colormapsandcolorscales.md
index 641cf2e6..6b25930d 100644
--- a/docs/src/colormapsandcolorscales.md
+++ b/docs/src/colormapsandcolorscales.md
@@ -27,7 +27,7 @@ julia> range(c1, stop=c2, length=15)
  RGB{N0f8}(1.0,0.0,0.0)
  RGB{N0f8}(0.929,0.035,0.0)
  RGB{N0f8}(0.859,0.071,0.0)
- RGB{N0f8}(0.784,0.11,0.0)
+ RGB{N0f8}(0.784,0.106,0.0)
  RGB{N0f8}(0.714,0.145,0.0)
  RGB{N0f8}(0.643,0.18,0.0)
  RGB{N0f8}(0.573,0.216,0.0)
@@ -35,7 +35,7 @@ julia> range(c1, stop=c2, length=15)
  RGB{N0f8}(0.427,0.286,0.0)
  RGB{N0f8}(0.357,0.322,0.0)
  RGB{N0f8}(0.286,0.357,0.0)
- RGB{N0f8}(0.216,0.392,0.0)
+ RGB{N0f8}(0.216,0.396,0.0)
  RGB{N0f8}(0.141,0.431,0.0)
  RGB{N0f8}(0.071,0.467,0.0)
  RGB{N0f8}(0.0,0.502,0.0)
diff --git a/src/conversions.jl b/src/conversions.jl
index 78ac7940..d9984d35 100644
--- a/src/conversions.jl
+++ b/src/conversions.jl
@@ -87,9 +87,12 @@ correct_gamut(c::CV) where {T<:Union{N0f8,N0f16,N0f32,N0f64},
 correct_gamut(c::CV) where {CV<:TransparentRGB} =
     CV(clamp01(red(c)), clamp01(green(c)), clamp01(blue(c)), clamp01(alpha(c))) # for `hex`
 
-function srgb_compand(v::Fractional)
+@inline function srgb_compand(v)
+    F = typeof(0.5 * v)
+    vf = F(v)
+    vc = @fastmath max(vf, F(0.0031308))
     # `pow5_12` is an optimized function to get `v^(1/2.4)`
-    v <= 0.0031308 ? 12.92v : 1.055 * pow5_12(v) - 0.055
+    vf > F(0.0031308) ? muladd(F(1.055), F(pow5_12(vc)), F(-0.055)) : F(12.92) * vf
 end
 
 function _hsx_to_rgb(im::UInt8, v, n, m)
@@ -278,9 +281,11 @@ cnvt(::Type{HSI{T}}, c::Color) where {T} = cnvt(HSI{T}, convert(RGB{T}, c))
 # Everything to XYZ
 # -----------------
 
-function invert_srgb_compand(v::Fractional)
+@inline function invert_srgb_compand(v)
+    F = typeof(0.5 * v)
+    vf = F(v)
     # `pow12_5` is an optimized function to get `x^2.4`
-    v <= 0.04045 ? 1/12.92 * v : pow12_5(1/1.055 * (v + 0.055))
+    vf > F(0.04045) ? pow12_5(muladd(F(1000/1055), vf, F(55/1055))) : F(100/1292) * vf
 end
 
 # lookup table for `N0f8` (the extra two elements are for `Float32` splines)
@@ -306,7 +311,9 @@ function invert_srgb_compand(v::Float32)
 end
 
 function cnvt(::Type{XYZ{T}}, c::AbstractRGB) where T
-    r, g, b = invert_srgb_compand(red(c)), invert_srgb_compand(green(c)), invert_srgb_compand(blue(c))
+    r = invert_srgb_compand(red(c))
+    g = invert_srgb_compand(green(c))
+    b = invert_srgb_compand(blue(c))
     XYZ{T}(0.4124564*r + 0.3575761*g + 0.1804375*b,
            0.2126729*r + 0.7151522*g + 0.0721750*b,
            0.0193339*r + 0.1191920*g + 0.9503041*b)
@@ -320,20 +327,25 @@ function cnvt(::Type{XYZ{T}}, c::xyY) where T
 end
 
 
-const xyz_epsilon = 216 / 24389
-const xyz_kappa   = 24389 / 27
+const xyz_epsilon   = 216 / 24389 # (6/29)^3
+const xyz_kappa     = 24389 / 27  # (29/6)^3*8
+const xyz_kappa_inv = 27 / 24389
 
 function cnvt(::Type{XYZ{T}}, c::Lab, wp::XYZ = WP_DEFAULT) where T
     fy = (c.l + 16) / 116
-    fx = c.a / 500 + fy
+    fx = fy + c.a / 500
     fz = fy - c.b / 200
 
     fx3 = fx^3
+    fy3 = fy^3
     fz3 = fz^3
 
-    x = fx3 > xyz_epsilon ? fx3 : (116fx - 16) / xyz_kappa
-    y = c.l > xyz_kappa * xyz_epsilon ? ((c. l+ 16) / 116)^3 : c.l / xyz_kappa
-    z = fz3 > xyz_epsilon ? fz3 : (116fz - 16) / xyz_kappa
+    epsilon = oftype(fx3, xyz_epsilon)
+    kappa_inv = oftype(fx3, xyz_kappa_inv)
+
+    x = fx3 > epsilon ? fx3 : muladd(116, fx, -16) * kappa_inv
+    y = fy3 > epsilon ? fy3 : c.l * kappa_inv
+    z = fz3 > epsilon ? fz3 : muladd(116, fz, -16) * kappa_inv
 
     XYZ{T}(x*wp.x, y*wp.y, z*wp.z)
 end
@@ -349,15 +361,18 @@ end
 
 
 function cnvt(::Type{XYZ{T}}, c::Luv, wp::XYZ = WP_DEFAULT) where T
-    (u_wp, v_wp) = xyz_to_uv(wp)
+    c.l == zero(c.l) && return XYZ{T}(zero(T), zero(T), zero(T))
 
-    a = (52 * (c.l==0 ? zero(T) : c.l / (c.u + 13 * c.l * u_wp)) - 1) / 3
-    y = c.l > xyz_kappa * xyz_epsilon ? wp.y * ((c.l + 16) / 116)^3 : wp.y * c.l / xyz_kappa
-    b = -5y
-    d = y * (39 * (c.l==0 ? zero(T) : c.l / (c.v + 13 * c.l * v_wp)) - 5)
-    x = d==b ? zero(T) : (d - b) / (a + 1/3)
-    z = a * x + b + zero(T)
+    u_wp, v_wp = xyz_to_uv(wp)
 
+    ls = c.l * oftype(u_wp, xyz_kappa_inv)
+    y = c.l > 8 ? wp.y * ((c.l + 16) / 116)^3 : wp.y * ls
+
+    u = c.u / (13c.l) + u_wp
+    v = c.v / (13c.l) + v_wp
+    v4 = 4v
+    x = y * 9u / v4
+    z = y * (12 - 3u - 20v) / v4
     XYZ{T}(x, y, z)
 end
 
@@ -411,12 +426,16 @@ cnvt(::Type{xyY{T}}, c::Color) where {T} = cnvt(xyY{T}, convert(XYZ{T}, c))
 # Everything to Lab
 # -----------------
 
-function fxyz2lab(v)
-    v > xyz_epsilon ? cbrt(v) : (xyz_kappa * v + 16) / 116
+@inline function fxyz2lab(v)
+    ka = oftype(v, 841 / 108) # (29/6)^2 / 3 = xyz_kappa / 116
+    kb = oftype(v, 16 / 116) # 4/29
+    vc = @fastmath max(v, oftype(v, xyz_epsilon))
+    @fastmath min(cbrt01(vc), muladd(ka, v, kb))
 end
+
 function cnvt(::Type{Lab{T}}, c::XYZ, wp::XYZ = WP_DEFAULT) where T
-    fx, fy, fz = fxyz2lab(c.x / wp.x), fxyz2lab(c.y / wp.y), fxyz2lab(c.z / wp.z)
-    Lab{T}(116fy - 16, 500(fx - fy), 200(fy - fz))
+    f = mapc(fxyz2lab, mapc((x, y) -> x / y, c, wp))
+    Lab{T}(muladd(116, f.y, -16), 500(f.x - f.y), 200(f.y - f.z))
 end
 
 
@@ -501,10 +520,12 @@ function cnvt(::Type{Luv{T}}, c::XYZ, wp::XYZ = WP_DEFAULT) where T
     (u_, v_) = xyz_to_uv(c)
 
     y = c.y / wp.y
+    epsilon = oftype(y, xyz_epsilon)
+    kappa = oftype(y, xyz_kappa)
 
-    l = y > xyz_epsilon ? 116 * cbrt(y) - 16 : xyz_kappa * y
-    u = 13 * l * (u_ - u_wp) + zero(T)
-    v = 13 * l * (v_ - v_wp) + zero(T)
+    l = y > epsilon ? 116 * cbrt(y) - 16 : kappa * y
+    u = 13 * l * (u_ - u_wp)
+    v = 13 * l * (v_ - v_wp)
 
     Luv{T}(l, u, v)
 end
@@ -566,8 +587,8 @@ function cnvt(::Type{DIN99{T}}, c::Lab) where T
     cc = log(1+0.045*g)/(0.045*kch*ke)
 
     # DIN99 chromaticities
-    a99 = g > 0 ? cc * ee / g : zero(T)
-    b99 = g > 0 ? cc * f / g : zero(T)
+    a99 = g > 0 ? convert(T, cc / g * ee) : zero(T)
+    b99 = g > 0 ? convert(T, cc / g * f ) : zero(T)
 
     DIN99{T}(l99, a99, b99)
 
diff --git a/src/utilities.jl b/src/utilities.jl
index 944e2918..0e84a123 100644
--- a/src/utilities.jl
+++ b/src/utilities.jl
@@ -14,17 +14,109 @@ end
 # mod6 supports the input `x` in [-2^28, 2^29]
 mod6(::Type{T}, x::Int32) where T = unsafe_trunc(T, x - 6 * ((widemul(x, 0x2aaaaaaa) + Int64(0x20000000)) >> 0x20))
 
-# TODO: move `pow7` from "src/differences.jl" to here
+# Approximation of the reciprocal of the cube root, x^(-1/3).
+# assuming that x > 0.003, the conditional branches are omitted.
+@inline function rcbrt(x::Float64)
+    ix = reinterpret(UInt64, x)
+    e0 = (ix >> 0x34) % UInt32
+    ed = e0 ÷ 0x3
+    er = e0 - ed * 0x3
+    a = 0x000b_f2d7 - 0x0005_5718 * er
+    e = (UInt32(1363) - ed) << 0x14 | a
+    t1 = reinterpret(Float64, UInt64(e) << 0x20)
+    h1 = muladd(t1^2, -x * t1, 1.0)
+    t2 = muladd(@evalpoly(h1, 1/3, 2/9, 14/81), h1 * t1, t1)
+    h2 = muladd(t2^2, -x * t2, 1.0)
+    t3 = muladd(muladd(2/9, h2, 1/3), h2 * t2, t2)
+    reinterpret(Float64, reinterpret(UInt64, t3) & 0xffff_ffff_8000_0000)
+end
+@inline function rcbrt(x::Float32)
+    ix = reinterpret(UInt32, x)
+    e0 = ix >> 0x17 + 0x2
+    ed = e0 ÷ 0x3
+    er = e0 - ed * 0x3
+    a = 0x005f_9cbe - 0x002a_bd7d * er
+    t1 = reinterpret(Float32, (UInt32(169) - ed) << 0x17 | a)
+    h1 = muladd(t1^2, -x * t1, 1.0f0)
+    t2 = muladd(muladd(2/9f0, h1, 1/3f0), h1 * t1, t1)
+    h2 = muladd(t2^2, -x * t2, 1.0f0)
+    t3 = muladd(1/3f0, h2 * t2, t2)
+    reinterpret(Float32, reinterpret(UInt32, t3) & 0xffff_f000)
+end
+
+cbrt01(x) = cbrt(x)
+@inline function cbrt01(x::Float64)
+    r = rcbrt(x) # x^(-1/3)
+    h = muladd(r^2, -x * r, 1.0)
+    e = muladd(2/9, h, 1/3) * h * r
+    muladd(r, x * r, x * e * (r + r + e)) # x * x^(-2/3)
+end
+@inline function cbrt01(x::Float32)
+    r = Float64(rcbrt(x)) # x^(-1/3)
+    h = muladd(r^2, -Float64(x) * r, 1.0)
+    e = muladd(muladd(14/81, h, 2/9), h, 1/3) * h
+    Float32(1 / muladd(r, e, r))
+end
 
 pow3_4(x) = (y = @fastmath(sqrt(x)); y*@fastmath(sqrt(y))) # x^(3/4)
 
 # `pow5_12` is called from `srgb_compand`.
-# `cbrt` generates a function call, so there is little benefit of `@fastmath`.
 pow5_12(x) = pow3_4(x) / cbrt(x) # 5/12 == 1/2 + 1/4 - 1/3 == 3/4 - 1/3
+@inline function pow5_12(x::Float64)
+    @noinline _cbrt(x) = cbrt01(x)
+    p3_4 = pow3_4(x)
+    # x^(-1/6)
+    if x < 0.02
+        t0 = @evalpoly(x, 3.1366722556806232,
+            -221.51395962221136, 19788.889459114234, -905934.6541469148, 1.5928561711645417e7)
+    elseif x < 0.12
+        t0 = @evalpoly(x, 2.3135905865468644,
+            -26.43664640894651, 385.0146581045545, -2890.0920682466267, 8366.343115590817)
+    elseif x < 1.2
+        t0 = @evalpoly(x, 1.7047813285940905, -3.1261253501167308,
+            7.498744828350077, -10.100319516746419, 6.820601476522508, -1.7978894213531524)
+    else
+        return p3_4 / _cbrt(x)
+    end
+    # x^(-1/3)
+    t1 = t0 * t0
+    h1 = muladd(t1^2, -x * t1, 1.0)
+    t2 = muladd(h1, 1/3 * t1, t1)
+    h2 = muladd(t2^2, -x * t2, 1.0)
+    t2h = @evalpoly(h2, 1/3, 2/9, 14/81) * h2 * t2 # Taylor series of (1-h)^(-1/3)
+    # x^(3/4) * x^(-1/3)
+    muladd(p3_4, t2, p3_4 * t2h)
+end
+@inline function pow5_12(x::Float32)
+    # x^(-1/3)
+    rc = rcbrt(x)
+    rcx = -rc * x
+    rch = muladd(muladd(rc, x, rcx), -rc^2, muladd(rc^2, rcx, 1.0f0)) # 1 - x * rc^3
+    rce = muladd(2/9f0, rch, 1/3f0) * rch * rc
+    # x^(3/4)
+    p3_4_f64 = pow3_4(Float64(x))
+    p3_4r = reinterpret(Float64, reinterpret(UInt64, p3_4_f64) & 0xffffffff_e0000000)
+    p3_4 = Float32(p3_4r)
+    p3_4e = Float32(p3_4_f64 - p3_4r)
+    # x^(3/4) * x^(-1/3)
+    muladd(p3_4, rc, muladd(p3_4, rce, p3_4e * rc))
+end
 
 # `pow12_5` is called from `invert_srgb_compand`.
-# x^y ≈ exp(y*log(x)) ≈ exp2(y*log2(y)); the middle form is faster
-@noinline pow12_5(x) = x^2 * exp(0.4 * log(x)) # 12/5 == 2.4 == 2 + 0.4
+pow12_5(x) = pow12_5(Float64(x))
+pow12_5(x::BigFloat) = x^big"2.4"
+@inline function pow12_5(x::Float64)
+    # x^0.4
+    t1 = @evalpoly(@fastmath(min(x, 1.75)), 0.24295462640373672,
+        1.7489099720303518, -1.9919942887850166, 1.3197188815160004, -0.3257258790067756)
+    t2 = muladd(2/5, muladd(x / t1^2, @fastmath(sqrt(t1)), -t1), t1) # Newton's method
+    t3 = muladd(2/5, muladd(x / t2^2, @fastmath(sqrt(t2)), -t2), t2)
+    t4 = muladd(2/5, muladd(x / t3^2, @fastmath(sqrt(t3)), -t3), t3)
+    # x^0.4 * x^2
+    rx = reinterpret(Float64, reinterpret(UInt64, x) & 0xffffffff_f8000000) # hi
+    e = x - rx # lo
+    muladd(t4, rx^2, t4 * (rx + rx + e) * e)
+end
 
 
 # Linear interpolation in [a, b] where x is in [0,1],
@@ -208,9 +300,8 @@ function weighted_color_mean(w1::Real, c1::C, c2::C) where {Cb <: Union{HSV, HSL
 end
 function _weighted_color_mean(w1::Real, c1::Colorant{T1}, c2::Colorant{T2}) where {T1,T2}
     @fastmath min(w1, oneunit(w1) - w1) >= zero(w1) || throw(DomainError(w1, "`w1` must be in [0, 1]"))
-    weight1 = convert(promote_type(T1, T2), w1) # TODO: Consider the need for this
-    weight2 = oneunit(weight1) - weight1
-    mapc((x, y) -> convert(promote_type(T1, T2), muladd(weight1, x, weight2 * y)), c1, c2)
+    w2 = oneunit(w1) - w1
+    mapc((x, y) -> convert(promote_type(T1, T2), muladd(w1, x, w2 * y)), c1, c2)
 end
 function _weighted_color_mean(w1::Integer, c1::C, c2::C) where C <: Colorant
     (w1 & 0b1) === w1 || throw(DomainError(w1, "`w1` must be in [0, 1]"))
diff --git a/test/colordiff.jl b/test/colordiff.jl
index 4775aee1..32ac61e2 100644
--- a/test/colordiff.jl
+++ b/test/colordiff.jl
@@ -33,8 +33,8 @@ using Colors, Test
         ((50.0000,   2.5000,   0.0000), (50.0000,   3.2972,   0.0000),  1.0000),
         ((50.0000,   2.5000,   0.0000), (50.0000,   1.8634,   0.5757),  1.0000),
         ((50.0000,   2.5000,   0.0000), (50.0000,   3.2592,   0.3350),  1.0000),
-        ((60.2574, -34.0099,  36.2577), (60.4626, -34.1751,  39.4387),  1.2644),
-        ((63.0109, -31.0961,  -5.8663), (62.8187, -29.7946,  -4.0864),  1.2530),
+        ((60.2574, -34.0099,  36.2677), (60.4626, -34.1751,  39.4387),  1.2644),
+        ((63.0109, -31.0961,  -5.8663), (62.8187, -29.7946,  -4.0864),  1.2630),
         ((61.2901,   3.7196,  -5.3901), (61.4292,   2.2480,  -4.9620),  1.8731),
         ((35.0831, -44.1164,   3.7933), (35.0232, -40.0716,   1.5901),  1.8645),
         ((22.7233,  20.0904, -46.6940), (23.0331,  14.9730, -42.5619),  2.0373),
@@ -45,13 +45,14 @@ using Colors, Test
         ((2.0776,    0.0795,  -1.1350), ( 0.9033,  -0.0636,  -0.5514),  0.9082)]
 
 
-    eps_cdiff = 0.01
+    eps_cdiff = 0.00005
 
     @testset "CIEDE2000" begin
         metric = DE_2000()
         for (a, b, dexpect) in abds
-            @test abs(dexpect - colordiff(Lab(a...), Lab(b...); metric=metric)) < eps_cdiff
-            @test abs(dexpect - colordiff(Lab(b...), Lab(a...); metric=metric)) < eps_cdiff
+            a64, b64 = Lab(a...), Lab(b...)
+            @test colordiff(a64, b64; metric=metric) ≈ dexpect atol=eps_cdiff
+            @test colordiff(b64, a64; metric=metric) ≈ dexpect atol=eps_cdiff
         end
     end
 
diff --git a/test/utilities.jl b/test/utilities.jl
index 0f69fbeb..8d143090 100644
--- a/test/utilities.jl
+++ b/test/utilities.jl
@@ -2,6 +2,9 @@ using Colors, FixedPointNumbers, Test
 using InteractiveUtils # for `subtypes`
 
 @testset "Utilities" begin
+    @test Colors.cbrt01(0.6) ≈ cbrt(big"0.6") atol=eps(1.0)
+    @test Colors.cbrt01(0.6f0) === cbrt(0.6f0)
+
     # issue #351
     xs = max.(rand(1000), 1e-4)
     @noinline l_pow_x_y() = for x in xs; x^2.4 end
@@ -19,6 +22,12 @@ using InteractiveUtils # for `subtypes`
         @warn "Optimization technique in `pow5_12` may have the opposite effect."
     end
 
+    @test Colors.pow5_12(0.6) ≈ Colors.pow5_12(big"0.6") atol=1e-6
+    @test Colors.pow5_12(0.6f0) ≈ Colors.pow5_12(big"0.6") atol=1e-6
+    @test Colors.pow5_12(0.6N0f16) ≈ Colors.pow5_12(big"0.6") atol=1e-6
+    @test Colors.pow12_5(0.6) ≈ Colors.pow12_5(big"0.6") atol=1e-6
+    @test Colors.pow12_5(0.6N0f16) ≈ Colors.pow12_5(big"0.6") atol=1e-6
+
     @testset "hex" begin
         base_hex = @test_logs (:warn, r"Base\.hex\(c\) has been moved") Base.hex(RGB(1,0.5,0))
         @test base_hex == hex(RGB(1,0.5,0))
@@ -176,35 +185,33 @@ using InteractiveUtils # for `subtypes`
         end
         gray_b1 = Gray{Bool}(1)
         gray_b0 = Gray{Bool}(0)
-        @test weighted_color_mean(0, gray_b1, gray_b0) === gray_b0
-        @test weighted_color_mean(1, gray_b1, gray_b0) === gray_b1
-        @test_broken weighted_color_mean(0.5, gray_b1, gray_b1) === gray_b1
+        @test @inferred(weighted_color_mean(0, gray_b1, gray_b0)) === gray_b0
+        @test @inferred(weighted_color_mean(1, gray_b1, gray_b0)) === gray_b1
+        @test @inferred(weighted_color_mean(0.5, gray_b1, gray_b1)) === gray_b1
         @test_throws InexactError weighted_color_mean(0.5, gray_b1, gray_b0)
         @test_throws DomainError weighted_color_mean(-1, gray_b1, gray_b0)
 
         for C in parametric2
             for T in colorElementTypes
-                C == AGray32 && (T=N0f8)
-                c1 = C(T(0),T(1))
-                c2 = C(T(1),T(0))
-                @inferred weighted_color_mean(0.5,c1,c2)
-                @test weighted_color_mean(0.5,c1,c2) == C(T(1)-T(0.5),T(0.5))
+                c1 = C(T(0), T(1))
+                c2 = C(T(1), T(0))
+                cx = C(T(0.5), T(0.5))
+                @test @inferred(weighted_color_mean(0.5, c1, c2)) == cx
             end
         end
 
         for C in colortypes
             for T in (Float16,Float32,Float64,BigFloat)
                 if C<:Color
-                        c1 = C(T(1),T(1),T(0))
-                        c2 = C(T(0),T(1),T(1))
-                    @inferred weighted_color_mean(0.5,c1,c2)
-                    @test weighted_color_mean(0.5,c1,c2) == C(T(0.5),T(0.5)+T(1)-T(0.5),T(1)-T(0.5))
+                    c1 = C(T(1), T(1), T(0))
+                    c2 = C(T(0), T(1), T(1))
+                    cx = C(T(0.5), T(1.0), T(0.5))
+                    @test @inferred(weighted_color_mean(0.5, c1, c2)) == cx
                 else
-                        C == ARGB32 && (T=N0f8)
-                        c1 = C(T(1),T(1),T(0),T(1))
-                        c2 = C(T(0),T(1),T(1),T(0))
-                    @inferred weighted_color_mean(0.5,c1,c2)
-                    @test weighted_color_mean(0.5,c1,c2) == C(T(0.5),T(0.5)+T(1)-T(0.5),T(1)-T(0.5),T(0.5))
+                    c1 = C(T(1), T(1), T(0), T(0))
+                    c2 = C(T(0), T(1), T(0), T(1))
+                    cx = C(T(0.5), T(1.0), T(0.0), T(0.5))
+                    @test @inferred(weighted_color_mean(0.5, c1, c2)) == cx
                 end
             end
         end