From ac34df9cbdb376a413696f3e638c7efa41b1d0aa Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 4 Oct 2022 23:15:34 -0400 Subject: [PATCH 01/10] autosize, take 1 --- src/outputsize.jl | 184 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/src/outputsize.jl b/src/outputsize.jl index 774b75ff26..432f43f929 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -168,3 +168,187 @@ for (fn, Dims) in ((:conv, DenseConvDims),) end end end + + +export @autosize + +""" + @autosize (size...,) Chain(Layer(_ => 2), Layer(_), ...) + +Returns the specified model, with each `_` replaced by an inferred number, +for input of the given size. + +The unknown sizes are always the second-last dimension (or the length of a vector), +of that layer's input, which Flux usually regards as the channel dimension. +The underscore may appear as an argument of a layer, or inside a `=>`. + +# Examples +``` +julia> @autosize (3,) Chain(Dense(_ => 2, sigmoid), Flux.Scale(_)) +Chain( + Dense(3 => 2, σ), # 8 parameters + Scale(2), # 4 parameters +) # Total: 4 arrays, 12 parameters, 304 bytes. + +julia> img = [28, 28]; + +julia> @autosize (img..., 1, 32) Chain( # size is only needed at runtime + Chain(c = Conv((3,3), _ => 5; stride=2, pad=SamePad()), + p = MeanPool((3,3)), + b = BatchNorm(_), + f = Flux.flatten), + Dense(_ => _÷4, relu, init=Flux.rand32), # can calculate output size _÷4 + SkipConnection(Dense(_ => _, relu), +), + Dense(_ => 10), + ) |> gpu # moves to GPU after initialisation +Chain( + Chain( + c = Conv((3, 3), 1 => 5, pad=1, stride=2), # 50 parameters + p = MeanPool((3, 3)), + b = BatchNorm(5), # 10 parameters, plus 10 + f = Flux.flatten, + ), + Dense(80 => 20, relu), # 1_620 parameters + SkipConnection( + Dense(20 => 20, relu), # 420 parameters + +, + ), + Dense(20 => 10), # 210 parameters +) # Total: 10 trainable arrays, 2_310 parameters, + # plus 2 non-trainable, 10 parameters, summarysize 10.469 KiB. + +julia> outputsize(ans, (28, 28, 1, 32)) +(10, 32) +``` + +Limitations: +* Won't work yet for Bilinear, except like `@autosize (5, 32) Flux.Bilinear(_ => 7)` +* Beyond a matrix it gets Dense wrong, e.g. `@autosize (2, 3, 4) Dense(_ => 5)` +* `LayerNorm(_,_)` probably won't work, needs first few input dims. +* RNN: `@autosize (7, 11) LSTM(_ => 5)` fails, but `outputsize(RNN(3=>7), (3,))` also fails. +""" +macro autosize(size, model) + Meta.isexpr(size, :tuple) || error("@autosize's first argument must be a tuple, the size of the input") + Meta.isexpr(model, :call) || error("@autosize's second argument must be something like Chain(layers...)") + ex = makelazy(model) + @gensym m + quote + $m = $ex + $outputsize($m, $size) + $striplazy($m) + end |> esc +end + +function makelazy(ex::Expr) + n = underscoredepth(ex) + n == 0 && return ex + n == 1 && error("@autosize doesn't expect an underscore here: $ex") + n == 2 && return :($LazyLayer($(string(ex)), $(makefun(ex)), nothing)) + n > 2 && return Expr(ex.head, ex.args[1], map(makelazy, ex.args[2:end])...) +end +makelazy(x) = x + +function underscoredepth(ex::Expr) + # Meta.isexpr(ex, :tuple) && :_ in ex.args && return 10 + ex.head in (:call, :kw, :(->), :block) || return 0 + ex.args[1] == :(=>) && ex.args[2] == :_ && return 1 + m = maximum(underscoredepth, ex.args) + m == 0 ? 
0 : m+1 +end +underscoredepth(ex) = Int(ex == :_) + +#= + +@autosize (3,) Chain(one = Dense(_ => 10)) # needs kw +@autosize (10,) Maxout(() -> Dense(_ => 7, tanh), 3) # needs ->, block + +=# + +function makefun(ex) + @gensym s + Expr(:(->), s, replaceunderscore(ex, s)) +end + +replaceunderscore(e, s) = e == :_ ? s : e +replaceunderscore(ex::Expr, s) = Expr(ex.head, map(a -> replaceunderscore(a, s), ex.args)...) + +mutable struct LazyLayer + str::String + make::Function + layer +end + +function (l::LazyLayer)(x::AbstractArray) + if l.layer != nothing + return l.layer(x) + end + # s = channelsize(x) + s = size(x, max(1, ndims(x)-1)) + lay = l.make(s) + y = try + lay(x) + catch e + @error l.str + return nothing + end + l.layer = striplazy(lay) # is this a good idea? + return y +end + +#= + +Flux.outputsize(Chain(Dense(2=>3)), (4,)) # nice error +Flux.outputsize(Dense(2=>3), (4,)) # no nice error +@autosize (4,) Dense(2=>3) # no nice error + +@autosize (3,) Dense(2 => _) # shouldn't work, weird error + + +@autosize (3,5,6) LayerNorm(_,_) # no complaint, but +ans(rand(3,5,6)) # this fails + + + +``` +julia> Flux.outputsize(LayerNorm(2), (3,)) +(3,) + +julia> LayerNorm(2)(rand(Float32, 3)) +ERROR: DimensionMismatch: arrays could not be broadcast to a common size; got a dimension with lengths 2 and 3 + +julia> BatchNorm(2)(fill(Flux.nil, 3)) |> size +(3,) + +julia> BatchNorm(2)(rand(3)) +ERROR: arraysize: dimension out of range +``` + + +=# + +# channelsize(x) = size(x, max(1, ndims(x)-1)) + +using Functors: functor, @functor + +@functor LazyLayer # (layer,) + +function striplazy(x) + fs, re = functor(x) + re(map(striplazy, fs)) +end +striplazy(l::LazyLayer) = l.layer == nothing ? error("should be initialised!") : l.layer + +# Could make LazyLayer usable outside of @autosize +# For instance allow @lazy + +function Base.show(io::IO, l::LazyLayer) + printstyled(io, "LazyLayer(", color=:light_black) + if l.layer == nothing + printstyled(io, l.str, color=:red) + else + printstyled(io, l.layer, color=:green) + end + printstyled(io, ")", color=:light_black) +end + +_big_show(io::IO, l::LazyLayer, indent::Int=0, name=nothing) = _layer_show(io, l, indent, name) From 604f2b4432f62329dad528619c60a7f88c1bbe8d Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 4 Oct 2022 23:15:50 -0400 Subject: [PATCH 02/10] fix outputsize on LayerNorm --- src/outputsize.jl | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/outputsize.jl b/src/outputsize.jl index 432f43f929..2110262438 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -147,8 +147,30 @@ outputsize(m::AbstractVector, input::Tuple...; padbatch=false) = outputsize(Chai ## bypass statistics in normalization layers -for layer in (:LayerNorm, :BatchNorm, :InstanceNorm, :GroupNorm) - @eval (l::$layer)(x::AbstractArray{Nil}) = x +# for layer in (:LayerNorm, :BatchNorm, :InstanceNorm, :GroupNorm) +# @eval (l::$layer)(x::AbstractArray{Nil}) = x +# end +for layer in (:BatchNorm, :InstanceNorm, :GroupNorm) + @eval function (l::$layer)(x::AbstractArray{Nil}) + l.chs == size(x, ndims(x)-1) || throw(DimensionMismatch( + string($layer, " expected ", l.chs, " channels, but got ", _channelsize(x)))) + x + end +end + +_channelsize(x::AbstractArray) = size(x, ndims(x)-1) +_channelsize(x::AbstractVector) = size(x, 1) + +function (l::LayerNorm)(x::AbstractArray{Nil,N}) where N + l.affine || return x + n = length(l.size) + l.size[1:min(n,N)] == size(x)[1:min(n,N)] || 
throw(DimensionMismatch( + string("LayerNorm expected size of input starting with ", l.size, ", but got size(x) == ", size(x)))) + if n <= N + return x + else + return similar(x, l.size) + end end ## fixes for layers that don't work out of the box From 46e06c73685908036363caa75dbcc18e168f8a96 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Wed, 5 Oct 2022 00:44:24 -0400 Subject: [PATCH 03/10] tidy & improve --- src/Flux.jl | 1 + src/outputsize.jl | 140 ++++++++++++++++++---------------------------- 2 files changed, 56 insertions(+), 85 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index b7d27406b0..a9df19e8c5 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -55,6 +55,7 @@ include("layers/show.jl") include("loading.jl") include("outputsize.jl") +export @autosize include("data/Data.jl") using .Data diff --git a/src/outputsize.jl b/src/outputsize.jl index 2110262438..8711a6c0e8 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -147,32 +147,14 @@ outputsize(m::AbstractVector, input::Tuple...; padbatch=false) = outputsize(Chai ## bypass statistics in normalization layers -# for layer in (:LayerNorm, :BatchNorm, :InstanceNorm, :GroupNorm) -# @eval (l::$layer)(x::AbstractArray{Nil}) = x -# end -for layer in (:BatchNorm, :InstanceNorm, :GroupNorm) +for layer in (:BatchNorm, :InstanceNorm, :GroupNorm) # LayerNorm works fine @eval function (l::$layer)(x::AbstractArray{Nil}) l.chs == size(x, ndims(x)-1) || throw(DimensionMismatch( - string($layer, " expected ", l.chs, " channels, but got ", _channelsize(x)))) + string($layer, " expected ", l.chs, " channels, but got size(x) == ", size(x)))) x end end -_channelsize(x::AbstractArray) = size(x, ndims(x)-1) -_channelsize(x::AbstractVector) = size(x, 1) - -function (l::LayerNorm)(x::AbstractArray{Nil,N}) where N - l.affine || return x - n = length(l.size) - l.size[1:min(n,N)] == size(x)[1:min(n,N)] || throw(DimensionMismatch( - string("LayerNorm expected size of input starting with ", l.size, ", but got size(x) == ", size(x)))) - if n <= N - return x - else - return similar(x, l.size) - end -end - ## fixes for layers that don't work out of the box for (fn, Dims) in ((:conv, DenseConvDims),) @@ -192,25 +174,25 @@ for (fn, Dims) in ((:conv, DenseConvDims),) end -export @autosize - """ @autosize (size...,) Chain(Layer(_ => 2), Layer(_), ...) Returns the specified model, with each `_` replaced by an inferred number, -for input of the given size. +for input of the given `size`. -The unknown sizes are always the second-last dimension (or the length of a vector), -of that layer's input, which Flux usually regards as the channel dimension. +The unknown sizes are usually the second-last dimension of that layer's input, +which Flux regards as the channel dimension. +(A few layers, `Dense` & [`LayerNorm`](@ref), instead always use the first dimension.) The underscore may appear as an argument of a layer, or inside a `=>`. +It may be used in further calculations, such as `Dense(_ => _÷4)`. # Examples ``` -julia> @autosize (3,) Chain(Dense(_ => 2, sigmoid), Flux.Scale(_)) +julia> @autosize (3, 1) Chain(Dense(_ => 2, sigmoid), BatchNorm(_, affine=false)) Chain( Dense(3 => 2, σ), # 8 parameters - Scale(2), # 4 parameters -) # Total: 4 arrays, 12 parameters, 304 bytes. 
+ BatchNorm(2, affine=false), +) julia> img = [28, 28]; @@ -244,15 +226,15 @@ julia> outputsize(ans, (28, 28, 1, 32)) ``` Limitations: -* Won't work yet for Bilinear, except like `@autosize (5, 32) Flux.Bilinear(_ => 7)` -* Beyond a matrix it gets Dense wrong, e.g. `@autosize (2, 3, 4) Dense(_ => 5)` -* `LayerNorm(_,_)` probably won't work, needs first few input dims. -* RNN: `@autosize (7, 11) LSTM(_ => 5)` fails, but `outputsize(RNN(3=>7), (3,))` also fails. +* While `@autosize (5, 32) Flux.Bilinear(_ => 7)` is OK, something like `Bilinear((_, _) => 7)` will fail. +* While `Scale(_)` and `LayerNorm(_)` are fine (and use the first dimension), `Scale(_,_)` and `LayerNorm(_,_)` + will fail if `size(x,1) != size(x,2)`. +* RNNs won't work: `@autosize (7, 11) LSTM(_ => 5)` fails, because `outputsize(RNN(3=>7), (3,))` also fails, a known issue. """ macro autosize(size, model) Meta.isexpr(size, :tuple) || error("@autosize's first argument must be a tuple, the size of the input") Meta.isexpr(model, :call) || error("@autosize's second argument must be something like Chain(layers...)") - ex = makelazy(model) + ex = _makelazy(model) @gensym m quote $m = $ex @@ -261,38 +243,56 @@ macro autosize(size, model) end |> esc end -function makelazy(ex::Expr) - n = underscoredepth(ex) +function _makelazy(ex::Expr) + n = _underscoredepth(ex) n == 0 && return ex n == 1 && error("@autosize doesn't expect an underscore here: $ex") - n == 2 && return :($LazyLayer($(string(ex)), $(makefun(ex)), nothing)) - n > 2 && return Expr(ex.head, ex.args[1], map(makelazy, ex.args[2:end])...) + n == 2 && return :($LazyLayer($(string(ex)), $(_makefun(ex)), nothing)) + n > 2 && return Expr(ex.head, ex.args[1], map(_makelazy, ex.args[2:end])...) end -makelazy(x) = x +_makelazy(x) = x -function underscoredepth(ex::Expr) +function _underscoredepth(ex::Expr) # Meta.isexpr(ex, :tuple) && :_ in ex.args && return 10 ex.head in (:call, :kw, :(->), :block) || return 0 ex.args[1] == :(=>) && ex.args[2] == :_ && return 1 - m = maximum(underscoredepth, ex.args) + m = maximum(_underscoredepth, ex.args) m == 0 ? 0 : m+1 end -underscoredepth(ex) = Int(ex == :_) +_underscoredepth(ex) = Int(ex == :_) #= -@autosize (3,) Chain(one = Dense(_ => 10)) # needs kw -@autosize (10,) Maxout(() -> Dense(_ => 7, tanh), 3) # needs ->, block +@autosize (3,) Chain(one = Dense(_ => 4), two = softmax) # needs kw +@autosize (3, 45) Maxout(() -> Dense(_ => 6, tanh), 2) # needs ->, block + +# here Parallel gets two inputs, no problem: +@autosize (3,) Chain(SkipConnection(Dense(_ => 4), Parallel(vcat, Dense(_ => 5), Dense(_ => 6))), Flux.Scale(_)) =# -function makefun(ex) - @gensym s - Expr(:(->), s, replaceunderscore(ex, s)) +function _makefun(ex) + T = Meta.isexpr(ex, :call) ? ex.args[1] : Type + @gensym x s + Expr(:(->), x, Expr(:block, :($s = $autosizefor($T, $x)), _replaceunderscore(ex, s))) end -replaceunderscore(e, s) = e == :_ ? s : e -replaceunderscore(ex::Expr, s) = Expr(ex.head, map(a -> replaceunderscore(a, s), ex.args)...) +""" + autosizefor(::Type, x) + +If an `_` in your layer's constructor, used within `@autosize`, should +*not* mean the 2nd-last dimension, then you can overload this. + +For instance `autosizefor(::Type{<:Dense}, x::AbstractArray) = size(x, 1)` +is needed to make `@autosize (2,3,4) Dense(_ => 5)` return +`Dense(2 => 5)` rather than `Dense(3 => 5)`. 
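For example, a user-defined layer could opt into the same first-dimension rule. This is a minimal sketch, assuming only the definitions introduced in this patch; `RowwiseScale` is an invented layer, not part of Flux:

```julia
using Flux

# An invented layer which, like Dense, consumes the first dimension of its input:
struct RowwiseScale{T}
  scale::Vector{T}
end
RowwiseScale(n::Integer) = RowwiseScale(ones(Float32, n))
(l::RowwiseScale)(x::AbstractArray) = l.scale .* x

# Assumed overload, so that `_` in `RowwiseScale(_)` means size(x, 1) rather than the channel dim:
Flux.autosizefor(::Type{<:RowwiseScale}, x::AbstractArray) = size(x, 1)

# Then `@autosize (3, 10) Chain(Dense(_ => 4), RowwiseScale(_))` should build `RowwiseScale(4)`,
# since the output of the Dense layer has first dimension 4.
```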
+""" +autosizefor(::Type, x::AbstractArray) = size(x, max(1, ndims(x)-1)) +autosizefor(::Type{<:Dense}, x::AbstractArray) = size(x, 1) +autosizefor(::Type{<:LayerNorm}, x::AbstractArray) = size(x, 1) + +_replaceunderscore(e, s) = e == :_ ? s : e +_replaceunderscore(ex::Expr, s) = Expr(ex.head, map(a -> _replaceunderscore(a, s), ex.args)...) mutable struct LazyLayer str::String @@ -301,19 +301,10 @@ mutable struct LazyLayer end function (l::LazyLayer)(x::AbstractArray) - if l.layer != nothing - return l.layer(x) - end - # s = channelsize(x) - s = size(x, max(1, ndims(x)-1)) - lay = l.make(s) - y = try - lay(x) - catch e - @error l.str - return nothing - end - l.layer = striplazy(lay) # is this a good idea? + l.layer == nothing || return l.layer(x) + lay = l.make(x) + y = lay(x) + l.layer = lay # mutate after we know that call worked return y end @@ -329,30 +320,9 @@ Flux.outputsize(Dense(2=>3), (4,)) # no nice error @autosize (3,5,6) LayerNorm(_,_) # no complaint, but ans(rand(3,5,6)) # this fails - - -``` -julia> Flux.outputsize(LayerNorm(2), (3,)) -(3,) - -julia> LayerNorm(2)(rand(Float32, 3)) -ERROR: DimensionMismatch: arrays could not be broadcast to a common size; got a dimension with lengths 2 and 3 - -julia> BatchNorm(2)(fill(Flux.nil, 3)) |> size -(3,) - -julia> BatchNorm(2)(rand(3)) -ERROR: arraysize: dimension out of range -``` - - =# -# channelsize(x) = size(x, max(1, ndims(x)-1)) - -using Functors: functor, @functor - -@functor LazyLayer # (layer,) +@functor LazyLayer function striplazy(x) fs, re = functor(x) @@ -360,8 +330,8 @@ function striplazy(x) end striplazy(l::LazyLayer) = l.layer == nothing ? error("should be initialised!") : l.layer -# Could make LazyLayer usable outside of @autosize -# For instance allow @lazy +# Could make LazyLayer usable outside of @autosize, for instance allow Chain(@lazy Dense(_ => 2))? +# But then it will survive to produce weird structural gradients etc. function Base.show(io::IO, l::LazyLayer) printstyled(io, "LazyLayer(", color=:light_black) From 310b71eaed17434bd917bee8d6a26cb7a37be60b Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Wed, 5 Oct 2022 14:07:00 -0400 Subject: [PATCH 04/10] add tests, release note --- NEWS.md | 3 +++ src/outputsize.jl | 10 ------- test/outputsize.jl | 65 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9bf97ddb3b..d83e76d62a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # Flux Release Notes +## v0.13.7 +* Added [`@autosize` macro](https://github.com/FluxML/Flux.jl/pull/2078) + ## v0.13.4 * Added [`PairwiseFusion` layer](https://github.com/FluxML/Flux.jl/pull/1983) diff --git a/src/outputsize.jl b/src/outputsize.jl index 8711a6c0e8..6caff3035f 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -261,16 +261,6 @@ function _underscoredepth(ex::Expr) end _underscoredepth(ex) = Int(ex == :_) -#= - -@autosize (3,) Chain(one = Dense(_ => 4), two = softmax) # needs kw -@autosize (3, 45) Maxout(() -> Dense(_ => 6, tanh), 2) # needs ->, block - -# here Parallel gets two inputs, no problem: -@autosize (3,) Chain(SkipConnection(Dense(_ => 4), Parallel(vcat, Dense(_ => 5), Dense(_ => 6))), Flux.Scale(_)) - -=# - function _makefun(ex) T = Meta.isexpr(ex, :call) ? 
ex.args[1] : Type @gensym x s diff --git a/test/outputsize.jl b/test/outputsize.jl index 667b5bad76..2084b26369 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -142,16 +142,81 @@ end m = LayerNorm(32) @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) + m2 = LayerNorm(3, 2) + @test outputsize(m2, (3, 2)) == (3, 2) == size(m2(randn(3, 2))) + @test outputsize(m2, (3,)) == (3, 2) == size(m2(randn(3, 2))) m = BatchNorm(3) @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) + @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) m = InstanceNorm(3) @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) + @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) m = GroupNorm(16, 4) @test outputsize(m, (32, 32, 16, 16)) == (32, 32, 16, 16) @test outputsize(m, (32, 32, 16); padbatch=true) == (32, 32, 16, 1) + @test_throws Exception m(randn(Float32, 32, 32, 15, 4)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 15, 4)) end + +@testset "autosize macro" begin + m = @autosize (3,) Dense(_ => 4) + @test randn(3) |> m |> size == (4,) + + m = @autosize (3, 1) Chain(Dense(_ => 4), Dense(4 => 10), softmax) + @test randn(3, 5) |> m |> size == (10, 5) + + m = @autosize (2, 3, 4, 5) Dense(_ => 10) # goes by first dim, not 2nd-last + @test randn(2, 3, 4, 5) |> m |> size == (10, 3, 4, 5) + + m = @autosize (9,) Dense(_ => div(_,2)) + @test randn(9) |> m |> size == (4,) + + m = @autosize (3,) Chain(one = Dense(_ => 4), two = softmax) # needs kw + @test randn(3) |> m |> size == (4,) + + m = @autosize (3, 45) Maxout(() -> Dense(_ => 6, tanh), 2) # needs ->, block + @test randn(3, 45) |> m |> size == (6, 45) + + # here Parallel gets two inputs, no problem: + m = @autosize (3,) Chain(SkipConnection(Dense(_ => 4), Parallel(vcat, Dense(_ => 5), Dense(_ => 6))), Flux.Scale(_)) + @test randn(3) |> m |> size == (11,) + + # like Dense, LayerNorm goes by the first dimension: + m = @autosize (3, 4, 5) LayerNorm(_) + @test rand(3, 6, 7) |> m |> size == (3, 6, 7) + + m = @autosize (3, 3, 10) LayerNorm(_, _) # does not check that sizes match + @test rand(3, 3, 10) |> m |> size == (3, 3, 10) + + m = @autosize (3,) Flux.Bilinear(_ => 10) + @test randn(3) |> m |> size == (10,) + + m = @autosize (3, 1) Flux.Bilinear(_ => 10) + @test randn(3, 4) |> m |> size == (10, 4) + + @test_throws Exception @eval @autosize (3,) Flux.Bilinear((_,3) => 10) + + # first docstring example + m = @autosize (3, 1) Chain(Dense(_ => 2, sigmoid), BatchNorm(_, affine=false)) + @test randn(3, 4) |> m |> size == (2, 4) + + # evil docstring example + img = [28, 28]; + m = @autosize (img..., 1, 32) Chain( # size is only needed at runtime + Chain(c = Conv((3,3), _ => 5; stride=2, pad=SamePad()), + p = MeanPool((3,3)), + b = BatchNorm(_), + f = Flux.flatten), + Dense(_ => _÷4, relu, init=Flux.rand32), # can calculate output size _÷4 + SkipConnection(Dense(_ => _, relu), +), + Dense(_ => 10), + ) |> gpu # moves to GPU after initialisation + @test randn(Float32, img..., 1, 32) |> gpu |> m |> size == (10, 32) +end \ No newline at end of file From 537b0118b09bb3751b2e62828a3b76fc8260c34e Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 6 Oct 2022 21:08:04 -0400 
Subject: [PATCH 05/10] rrule errors, improvements, tests --- src/outputsize.jl | 55 ++++++++++++++++++++++------------------------ test/outputsize.jl | 26 ++++++++++++++++++++-- 2 files changed, 50 insertions(+), 31 deletions(-) diff --git a/src/outputsize.jl b/src/outputsize.jl index 6caff3035f..40170ad8d2 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -255,11 +255,11 @@ _makelazy(x) = x function _underscoredepth(ex::Expr) # Meta.isexpr(ex, :tuple) && :_ in ex.args && return 10 ex.head in (:call, :kw, :(->), :block) || return 0 - ex.args[1] == :(=>) && ex.args[2] == :_ && return 1 + ex.args[1] === :(=>) && ex.args[2] === :_ && return 1 m = maximum(_underscoredepth, ex.args) m == 0 ? 0 : m+1 end -_underscoredepth(ex) = Int(ex == :_) +_underscoredepth(ex) = Int(ex === :_) function _makefun(ex) T = Meta.isexpr(ex, :call) ? ex.args[1] : Type @@ -281,7 +281,7 @@ autosizefor(::Type, x::AbstractArray) = size(x, max(1, ndims(x)-1)) autosizefor(::Type{<:Dense}, x::AbstractArray) = size(x, 1) autosizefor(::Type{<:LayerNorm}, x::AbstractArray) = size(x, 1) -_replaceunderscore(e, s) = e == :_ ? s : e +_replaceunderscore(e, s) = e === :_ ? s : e _replaceunderscore(ex::Expr, s) = Expr(ex.head, map(a -> _replaceunderscore(a, s), ex.args)...) mutable struct LazyLayer @@ -290,45 +290,42 @@ mutable struct LazyLayer layer end -function (l::LazyLayer)(x::AbstractArray) - l.layer == nothing || return l.layer(x) - lay = l.make(x) - y = lay(x) - l.layer = lay # mutate after we know that call worked +@functor LazyLayer + +function (l::LazyLayer)(x::AbstractArray, ys::AbstractArray...) + l.layer === nothing || return l.layer(x, ys...) + made = l.make(x) # for something like `Bilinear((_,__) => 7)`, perhaps need `make(xy...)`, later. + y = made(x, ys...) + l.layer = made # mutate after we know that call worked return y end -#= - -Flux.outputsize(Chain(Dense(2=>3)), (4,)) # nice error -Flux.outputsize(Dense(2=>3), (4,)) # no nice error -@autosize (4,) Dense(2=>3) # no nice error - -@autosize (3,) Dense(2 => _) # shouldn't work, weird error - - -@autosize (3,5,6) LayerNorm(_,_) # no complaint, but -ans(rand(3,5,6)) # this fails - -=# - -@functor LazyLayer - -function striplazy(x) - fs, re = functor(x) +function striplazy(m) + fs, re = functor(m) re(map(striplazy, fs)) end -striplazy(l::LazyLayer) = l.layer == nothing ? error("should be initialised!") : l.layer +function striplazy(l::LazyLayer) + l.layer === nothing || return l.layer + error("LazyLayer should be initialised, e.g. by outputsize(model, size), before using stiplazy") +end # Could make LazyLayer usable outside of @autosize, for instance allow Chain(@lazy Dense(_ => 2))? # But then it will survive to produce weird structural gradients etc. +function ChainRulesCore.rrule(l::LazyLayer, x) + l(x), _ -> error("LazyLayer should never be used within a gradient. Call striplazy(model) first to remove all.") +end +function ChainRulesCore.rrule(::typeof(striplazy), m) + striplazy(m), _ -> error("striplazy should never be used within a gradient") +end + +params!(p::Params, x::LazyLayer, seen = IdSet()) = error("LazyLayer should never be used within params(m). 
Call striplazy(m) first.") function Base.show(io::IO, l::LazyLayer) printstyled(io, "LazyLayer(", color=:light_black) if l.layer == nothing - printstyled(io, l.str, color=:red) + printstyled(io, l.str, color=:magenta) else - printstyled(io, l.layer, color=:green) + printstyled(io, l.layer, color=:cyan) end printstyled(io, ")", color=:light_black) end diff --git a/test/outputsize.jl b/test/outputsize.jl index 2084b26369..2e9595f699 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -169,7 +169,7 @@ end m = @autosize (3,) Dense(_ => 4) @test randn(3) |> m |> size == (4,) - m = @autosize (3, 1) Chain(Dense(_ => 4), Dense(4 => 10), softmax) + m = @autosize (3, 1) Chain(Dense(_, 4), Dense(4 => 10), softmax) @test randn(3, 5) |> m |> size == (10, 5) m = @autosize (2, 3, 4, 5) Dense(_ => 10) # goes by first dim, not 2nd-last @@ -201,6 +201,9 @@ end m = @autosize (3, 1) Flux.Bilinear(_ => 10) @test randn(3, 4) |> m |> size == (10, 4) + m = @autosize (3,) SkipConnection(Dense(_ => _), Flux.Bilinear(_ => 10)) # Bilinear gets two inputs + @test randn(3, 4) |> m |> size == (10, 4) + @test_throws Exception @eval @autosize (3,) Flux.Bilinear((_,3) => 10) # first docstring example @@ -219,4 +222,23 @@ end Dense(_ => 10), ) |> gpu # moves to GPU after initialisation @test randn(Float32, img..., 1, 32) |> gpu |> m |> size == (10, 32) -end \ No newline at end of file +end + +@testset "LazyLayer" begin + # This is what `@autosize` uses, ideally nobody should make these by hand! + # Implicitly testeed by the macro, explicitly here too: + ld = Flux.LazyLayer("Dense(_ => 3, relu; init=??)", x -> Dense(Flux.autosizefor(Dense, x) => 3, relu, init=ones), nothing) + + lm = Chain(ld, Flux.Scale(3)) + @test string(ld) == "LazyLayer(Dense(_ => 3, relu; init=??))" + @test_throws Exception Flux.striplazy(lm) + + @test lm([1,2]) == [3,3,3] + + @test string(ld) == "LazyLayer(Dense(2 => 3, relu))" + @test Flux.striplazy(ld) isa Dense + + @test_throws Exception Flux.params(lm) + @test_throws Exception gradient(x -> sum(abs2, lm(x)), [1,2]) + @test_throws Exception gradient(m -> sum(abs2, Flux.striplazy(m)([1,2])), ld) +end From b2016e20430d3295518faad1d8120d85fec44a73 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 6 Oct 2022 23:27:22 -0400 Subject: [PATCH 06/10] documentation --- docs/src/outputsize.md | 85 +++++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index d692816b46..e69f4c9d7e 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -1,47 +1,72 @@ # Shape Inference -To help you generate models in an automated fashion, [`Flux.outputsize`](@ref) lets you -calculate the size returned produced by layers for a given size input. -This is especially useful for layers like [`Conv`](@ref). +Flux has some tools to help generate models in an automated fashion, by inferring the size +of arrays that layers will recieve, without doing any computation. +This is especially useful for convolutional models, where the same [`Conv`](@ref) layer +accepts any size of image, but the next layer may not. -It works by passing a "dummy" array into the model that preserves size information without running any computation. -`outputsize(f, inputsize)` works for all layers (including custom layers) out of the box. 
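For instance, a minimal usage sketch of that signature (the two-layer model here is invented for illustration; the commented sizes follow from the layer definitions):

```julia
using Flux

m = Chain(Dense(10 => 5, relu), Dense(5 => 2))

Flux.outputsize(m, (10, 64))              # (2, 64): the input size includes the batch dimension
Flux.outputsize(m, (10,); padbatch=true)  # (2, 1): a batch dimension of 1 is appended
```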
-By default, `inputsize` expects the batch dimension, -but you can exclude the batch size with `outputsize(f, inputsize; padbatch=true)` (assuming it to be one). +The higher-level one is a macro [`@autosize`](@ref) which acts on the code defining the layers, +and replaces each appearance of `_` with the relevant size. A simple example might be: -Using this utility function lets you automate model building for various inputs like so: ```julia -""" - make_model(width, height, inchannels, nclasses; - layer_config = [16, 16, 32, 32, 64, 64]) +@autosize (28, 28, 1, 32) Chain(Conv((3, 3), _ => 5, relu, stride=2), Flux.flatten, Dense(_ => 10)) +``` + +The size may be provided at runtime, like `@autosize (sz..., 1, 32) Chain(Conv(`..., but all the +layer constructors containing `_` must be explicitly written out -- the macro sees the code as written. -Create a CNN for a given set of configuration parameters. +This relies on a lower-level function [`outputsize`](@ref Flux.outputsize), which you can also use directly: -# Arguments -- `width`: the input image width -- `height`: the input image height -- `inchannels`: the number of channels in the input image -- `nclasses`: the number of output classes -- `layer_config`: a vector of the number of filters per each conv layer +```julia +c = Conv((3, 3), 1 => 5, relu, stride=2) +Flux.outputsize(c, (28, 28, 1, 32)) # returns (13, 13, 5, 32) +``` + +The function `outputsize` works by passing a "dummy" array into the model, which propagates through very cheaply. +It should work for all layers, including custom layers, out of the box. + +An example of how to automate model building is this: +```julia """ -function make_model(width, height, inchannels, nclasses; - layer_config = [16, 16, 32, 32, 64, 64]) - # construct a vector of conv layers programmatically - conv_layers = [Conv((3, 3), inchannels => layer_config[1])] - for (infilters, outfilters) in zip(layer_config, layer_config[2:end]) - push!(conv_layers, Conv((3, 3), infilters => outfilters)) + make_model(width, height, [inchannels, nclasses; layer_config]) + +Create a CNN for a given set of configuration parameters. 
Arguments: +- `width`, `height`: the input image size in pixels +- `inchannels`: the number of channels in the input image, default `1` +- `nclasses`: the number of output classes, default `10` +- Keyword `layer_config`: a vector of the number of filters per layer, default `[16, 16, 32, 64]` +""" +function make_model(width, height, inchannels = 1, nclasses = 10; + layer_config = [16, 16, 32, 64]) + # construct a vector of layers: + conv_layers = [] + push!(conv_layers, Conv((5, 5), inchannels => layer_config[1], relu, pad=SamePad())) + for (inch, outch) in zip(layer_config, layer_config[2:end]) + push!(conv_layers, Conv((3, 3), inch => outch, sigmoid, stride=2)) end - # compute the output dimensions for the conv layers - # use padbatch=true to set the batch dimension to 1 - conv_outsize = Flux.outputsize(conv_layers, (width, height, nchannels); padbatch=true) + # compute the output dimensions after these conv layers: + conv_outsize = Flux.outputsize(conv_layers, (width, height, inchannels); padbatch=true) + + # use this to define appropriate Dense layer: + last_layer = Dense(prod(conv_outsize) => nclasses) + return Chain(conv_layers..., Flux.flatten, last_layer) +end + +make_model(28, 28, 3, layer_config = [8, 17, 33, 65]) +``` + +Alternatively, using the macro, the definition of `make_model` could end with: - # the input dimension to Dense is programatically calculated from - # width, height, and nchannels - return Chain(conv_layers..., Dense(prod(conv_outsize) => nclasses)) +``` + # compute the output dimensions & construct appropriate Dense layer: + return @autosize (width, height, inchannels, 1) Chain(conv_layers..., Flux.flatten, Dense(_ => nclasses)) end ``` +### Listing + ```@docs +Flux.@autosize Flux.outputsize ``` From e2ab1ece4ea1ca1ba39188d44df96aefea3d9018 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 7 Oct 2022 09:50:03 -0400 Subject: [PATCH 07/10] tweaks --- docs/src/outputsize.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index e69f4c9d7e..4e41b51648 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -5,17 +5,18 @@ of arrays that layers will recieve, without doing any computation. This is especially useful for convolutional models, where the same [`Conv`](@ref) layer accepts any size of image, but the next layer may not. -The higher-level one is a macro [`@autosize`](@ref) which acts on the code defining the layers, -and replaces each appearance of `_` with the relevant size. A simple example might be: +The higher-level tool is a macro [`@autosize`](@ref) which acts on the code defining the layers, +and replaces each appearance of `_` with the relevant size. This simple example returns a model +with `Dense(845 => 10)` as the last layer: ```julia @autosize (28, 28, 1, 32) Chain(Conv((3, 3), _ => 5, relu, stride=2), Flux.flatten, Dense(_ => 10)) ``` -The size may be provided at runtime, like `@autosize (sz..., 1, 32) Chain(Conv(`..., but all the +The input size may be provided at runtime, like `@autosize (sz..., 1, 32) Chain(Conv(`..., but all the layer constructors containing `_` must be explicitly written out -- the macro sees the code as written. 
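As a concrete sketch of that runtime-size pattern (`build_classifier` is an invented helper; it assumes a Flux version that provides `@autosize`):

```julia
using Flux

# The image size is only known at runtime, but every layer constructor that uses `_`
# is written out literally inside the macro call, so the macro can rewrite it.
function build_classifier(imgsize::Tuple{Int,Int}, nclasses::Int)
    @autosize (imgsize..., 1, 1) Chain(
        Conv((3, 3), _ => 8, relu, pad=SamePad()),
        Flux.flatten,
        Dense(_ => nclasses),
    )
end

build_classifier((28, 28), 10)  # should end in Dense(6272 => 10), since 28*28*8 == 6272
```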
-This relies on a lower-level function [`outputsize`](@ref Flux.outputsize), which you can also use directly: +This macro relies on a lower-level function [`outputsize`](@ref Flux.outputsize), which you can also use directly: ```julia c = Conv((3, 3), 1 => 5, relu, stride=2) @@ -53,7 +54,9 @@ function make_model(width, height, inchannels = 1, nclasses = 10; return Chain(conv_layers..., Flux.flatten, last_layer) end -make_model(28, 28, 3, layer_config = [8, 17, 33, 65]) +m = make_model(28, 28, 3, layer_config = [9, 17, 33, 65]) + +Flux.outputsize(m, (28, 28, 3, 42)) == (10, 42) == size(m(randn(Float32, 28, 28, 3, 42))) ``` Alternatively, using the macro, the definition of `make_model` could end with: From 67ea6a7ce575878b40f088666da676c53814ff89 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 8 Oct 2022 20:33:58 -0400 Subject: [PATCH 08/10] add jldoctest; output = false --- docs/src/outputsize.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index 4e41b51648..768815c14e 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -27,7 +27,7 @@ The function `outputsize` works by passing a "dummy" array into the model, which It should work for all layers, including custom layers, out of the box. An example of how to automate model building is this: -```julia +```jldoctest; output = false """ make_model(width, height, [inchannels, nclasses; layer_config]) @@ -57,6 +57,10 @@ end m = make_model(28, 28, 3, layer_config = [9, 17, 33, 65]) Flux.outputsize(m, (28, 28, 3, 42)) == (10, 42) == size(m(randn(Float32, 28, 28, 3, 42))) + +# output + +true ``` Alternatively, using the macro, the definition of `make_model` could end with: From 936bb5b0e91567f4398f644a41a1bb739fb50688 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 10 Oct 2022 09:07:35 -0400 Subject: [PATCH 09/10] tweak --- docs/src/outputsize.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index 768815c14e..037a90d6aa 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -35,7 +35,7 @@ Create a CNN for a given set of configuration parameters. Arguments: - `width`, `height`: the input image size in pixels - `inchannels`: the number of channels in the input image, default `1` - `nclasses`: the number of output classes, default `10` -- Keyword `layer_config`: a vector of the number of filters per layer, default `[16, 16, 32, 64]` +- Keyword `layer_config`: a vector of the number of channels per layer, default `[16, 16, 32, 64]` """ function make_model(width, height, inchannels = 1, nclasses = 10; layer_config = [16, 16, 32, 64]) From 5c1ed685351ee7a10b9e73bbc1e914b47a160fb9 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 10 Oct 2022 11:15:58 -0400 Subject: [PATCH 10/10] using Flux --- docs/src/outputsize.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index 037a90d6aa..9376db9ab8 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -27,7 +27,7 @@ The function `outputsize` works by passing a "dummy" array into the model, which It should work for all layers, including custom layers, out of the box. 
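For instance, a plain user-defined callable needs no extra definitions (a minimal sketch; `Doubler` is invented for illustration and is not a Flux layer):

```julia
using Flux

struct Doubler end                      # an invented layer with no trainable parameters
(::Doubler)(x::AbstractArray) = 2 .* x

m = Chain(Dense(5 => 3, relu), Doubler(), Dense(3 => 2))

Flux.outputsize(m, (5, 16))  # (2, 16), with no special support needed for Doubler
```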
An example of how to automate model building is this: -```jldoctest; output = false +```jldoctest; output = false, setup = :(using Flux) """ make_model(width, height, [inchannels, nclasses; layer_config])