@@ -152,11 +152,12 @@ testmode!(m::AlphaDropout, mode=true) =
   (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m)
 
 """
-    LayerNorm(size..., λ=identity; affine=true, ϵ=1f-5)
+    LayerNorm(size..., λ=identity; affine=true, eps=1f-5)
 
 A [normalisation layer](https://arxiv.org/abs/1607.06450) designed to be
 used with recurrent hidden states.
 The argument `size` should be an integer or a tuple of integers.
+
 In the forward pass, the layer normalises the mean and standard
 deviation of the input, then applies the elementwise activation `λ`.
 The input is normalised along the first `length(size)` dimensions
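The hunk above only touches the docstring: the advertised keyword changes from Greek `ϵ` to ASCII `eps`. A minimal usage sketch on this branch (the size `5` and value `1f-4` are arbitrary):

using Flux

ln = LayerNorm(5; eps=1f-4)   # new ASCII spelling
ln2 = LayerNorm(5; ϵ=1f-4)    # old Greek spelling still works, but warns (see the constructor change below)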
@@ -190,9 +191,10 @@ struct LayerNorm{F,D,T,N}
   affine::Bool
 end
 
-function LayerNorm(size::Tuple{Vararg{Int}}, λ=identity; affine::Bool=true, ϵ::Real=1f-5)
+function LayerNorm(size::Tuple{Vararg{Int}}, λ=identity; affine::Bool=true, eps::Real=1f-5, ϵ=nothing)
+  ε = _greek_ascii_depwarn(ϵ => eps, :LayerNorm, "ϵ" => "eps")
   diag = affine ? Scale(size..., λ) : λ!=identity ? Base.Fix1(broadcast, λ) : identity
-  return LayerNorm(λ, diag, ϵ, size, affine)
+  return LayerNorm(λ, diag, ε, size, affine)
 end
 LayerNorm(size::Integer...; kw...) = LayerNorm(Int.(size); kw...)
 LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:end-1]), size_act[end]; kw...)
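Both keywords are reconciled by `_greek_ascii_depwarn`, which is defined elsewhere in this PR, not in these hunks. A hypothetical sketch of the behaviour the call sites rely on (name and argument order taken from the calls above; the body is assumed):

# Assumed sketch, not the actual Flux definition: prefer the ASCII
# keyword's value; if the deprecated Greek keyword was passed, warn
# and honour it instead.
function _greek_ascii_depwarn(values::Pair, layer::Symbol, names::Pair)
  greek, ascii = values.first, values.second   # e.g. nothing => 1f-5
  isnothing(greek) && return ascii             # Greek keyword not supplied
  Base.depwarn("$layer(; $(names.first) = ...) is deprecated, please use $layer(; $(names.second) = ...) instead", layer)
  return greek
end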
@@ -269,7 +271,7 @@ ChainRulesCore.@non_differentiable _track_stats!(::Any...)
     BatchNorm(channels::Integer, λ=identity;
               initβ=zeros32, initγ=ones32,
               affine=true, track_stats=true, active=nothing,
-              ϵ=1f-5, momentum=0.1f0)
+              eps=1f-5, momentum=0.1f0)
 
 [Batch Normalization](https://arxiv.org/abs/1502.03167) layer.
 `channels` should be the size of the channel dimension in your data (see below).
@@ -321,16 +323,18 @@ end
 
 function BatchNorm(chs::Int, λ=identity;
                    initβ=zeros32, initγ=ones32,
-                   affine=true, track_stats=true, active::Union{Bool,Nothing}=nothing,
-                   ϵ=1f-5, momentum=0.1f0)
+                   affine::Bool=true, track_stats::Bool=true, active::Union{Bool,Nothing}=nothing,
+                   eps::Real=1f-5, momentum::Real=0.1f0, ϵ=nothing)
+
+  ε = _greek_ascii_depwarn(ϵ => eps, :BatchNorm, "ϵ" => "eps")
 
   β = affine ? initβ(chs) : nothing
   γ = affine ? initγ(chs) : nothing
   μ = track_stats ? zeros32(chs) : nothing
   σ² = track_stats ? ones32(chs) : nothing
 
   return BatchNorm(λ, β, γ,
-            μ, σ², ϵ, momentum,
+            μ, σ², ε, momentum,
             affine, track_stats,
             active, chs)
 end
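`BatchNorm` gets the same treatment, plus `::Bool`/`::Real` annotations on the other keywords. Only the keyword is renamed; the positional `BatchNorm(λ, β, γ, ...)` call suggests the struct field keeps its Greek name. A hedged check under that assumption:

using Flux

bn = BatchNorm(3; eps=1f-4, momentum=0.9f0)
bn.ϵ == 1f-4   # assumed: field name unchanged, only the keyword differs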
@@ -361,7 +365,7 @@
     InstanceNorm(channels::Integer, λ=identity;
                  initβ=zeros32, initγ=ones32,
                  affine=false, track_stats=false,
-                 ϵ=1f-5, momentum=0.1f0)
+                 eps=1f-5, momentum=0.1f0)
 
 [Instance Normalization](https://arxiv.org/abs/1607.08022) layer.
 `channels` should be the size of the channel dimension in your data (see below).
@@ -411,16 +415,18 @@ end
 
 function InstanceNorm(chs::Int, λ=identity;
                       initβ=zeros32, initγ=ones32,
-                      affine=false, track_stats=false, active::Union{Bool,Nothing}=nothing,
-                      ϵ=1f-5, momentum=0.1f0)
+                      affine::Bool=false, track_stats::Bool=false, active::Union{Bool,Nothing}=nothing,
+                      eps::Real=1f-5, momentum::Real=0.1f0, ϵ=nothing)
+
+  ε = _greek_ascii_depwarn(ϵ => eps, :InstanceNorm, "ϵ" => "eps")
 
   β = affine ? initβ(chs) : nothing
   γ = affine ? initγ(chs) : nothing
   μ = track_stats ? zeros32(chs) : nothing
   σ² = track_stats ? ones32(chs) : nothing
 
   return InstanceNorm(λ, β, γ,
-            μ, σ², ϵ, momentum,
+            μ, σ², ε, momentum,
             affine, track_stats,
             active, chs)
 end
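`InstanceNorm` keeps its different defaults (`affine=false`, `track_stats=false`) while gaining the same `eps`/`ϵ` handling. For example (the field name `β` is assumed from the positional constructor call):

using Flux

inorm = InstanceNorm(3; eps=1f-4)
isnothing(inorm.β)   # true: affine=false by default, so no bias parameter is created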
@@ -450,7 +456,7 @@
     GroupNorm(channels::Integer, G::Integer, λ=identity;
               initβ=zeros32, initγ=ones32,
               affine=true, track_stats=false,
-              ϵ=1f-5, momentum=0.1f0)
+              eps=1f-5, momentum=0.1f0)
 
 [Group Normalization](https://arxiv.org/abs/1803.08494) layer.
 
@@ -508,12 +514,13 @@ trainable(gn::GroupNorm) = hasaffine(gn) ? (β = gn.β, γ = gn.γ) : (;)
 
 function GroupNorm(chs::Int, G::Int, λ=identity;
                    initβ=zeros32, initγ=ones32,
-                   affine=true, track_stats=false, active::Union{Bool,Nothing}=nothing,
-                   ϵ=1f-5, momentum=0.1f0)
+                   affine::Bool=true, track_stats::Bool=false, active::Union{Bool,Nothing}=nothing,
+                   eps::Real=1f-5, momentum::Real=0.1f0, ϵ=nothing)
 
-  if track_stats
+  if track_stats
     Base.depwarn("`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", :GroupNorm)
-  end
+  end
+  ε = _greek_ascii_depwarn(ϵ => eps, :GroupNorm, "ϵ" => "eps")
 
   chs % G == 0 || error("The number of groups ($(G)) must divide the number of channels ($chs)")
 
@@ -525,7 +532,7 @@
   return GroupNorm(G, λ,
             β, γ,
             μ, σ²,
-            ϵ, momentum,
+            ε, momentum,
             affine, track_stats,
             active, chs)
 end
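All four layers follow the same recipe, so the deprecation can be exercised uniformly. A test sketch (layer sizes are arbitrary; `@test_deprecated` from the Test stdlib only sees the warning when Julia runs with `--depwarn=yes`):

using Flux, Test

for (Layer, args) in ((LayerNorm, (4,)), (BatchNorm, (4,)),
                      (InstanceNorm, (4,)), (GroupNorm, (4, 2)))
  Layer(args...; eps=1f-4)                 # new keyword: silent
  @test_deprecated Layer(args...; ϵ=1f-4)  # deprecated keyword: warns
end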