From ac34df9cbdb376a413696f3e638c7efa41b1d0aa Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 4 Oct 2022 23:15:34 -0400 Subject: [PATCH 01/10] autosize, take 1 --- src/outputsize.jl | 184 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/src/outputsize.jl b/src/outputsize.jl index 774b75ff26..432f43f929 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -168,3 +168,187 @@ for (fn, Dims) in ((:conv, DenseConvDims),) end end end + + +export @autosize + +""" + @autosize (size...,) Chain(Layer(_ => 2), Layer(_), ...) + +Returns the specified model, with each `_` replaced by an inferred number, +for input of the given size. + +The unknown sizes are always the second-last dimension (or the length of a vector), +of that layer's input, which Flux usually regards as the channel dimension. +The underscore may appear as an argument of a layer, or inside a `=>`. + +# Examples +``` +julia> @autosize (3,) Chain(Dense(_ => 2, sigmoid), Flux.Scale(_)) +Chain( + Dense(3 => 2, σ), # 8 parameters + Scale(2), # 4 parameters +) # Total: 4 arrays, 12 parameters, 304 bytes. + +julia> img = [28, 28]; + +julia> @autosize (img..., 1, 32) Chain( # size is only needed at runtime + Chain(c = Conv((3,3), _ => 5; stride=2, pad=SamePad()), + p = MeanPool((3,3)), + b = BatchNorm(_), + f = Flux.flatten), + Dense(_ => _÷4, relu, init=Flux.rand32), # can calculate output size _÷4 + SkipConnection(Dense(_ => _, relu), +), + Dense(_ => 10), + ) |> gpu # moves to GPU after initialisation +Chain( + Chain( + c = Conv((3, 3), 1 => 5, pad=1, stride=2), # 50 parameters + p = MeanPool((3, 3)), + b = BatchNorm(5), # 10 parameters, plus 10 + f = Flux.flatten, + ), + Dense(80 => 20, relu), # 1_620 parameters + SkipConnection( + Dense(20 => 20, relu), # 420 parameters + +, + ), + Dense(20 => 10), # 210 parameters +) # Total: 10 trainable arrays, 2_310 parameters, + # plus 2 non-trainable, 10 parameters, summarysize 10.469 KiB. + +julia> outputsize(ans, (28, 28, 1, 32)) +(10, 32) +``` + +Limitations: +* Won't work yet for Bilinear, except like `@autosize (5, 32) Flux.Bilinear(_ => 7)` +* Beyond a matrix it gets Dense wrong, e.g. `@autosize (2, 3, 4) Dense(_ => 5)` +* `LayerNorm(_,_)` probably won't work, needs first few input dims. +* RNN: `@autosize (7, 11) LSTM(_ => 5)` fails, but `outputsize(RNN(3=>7), (3,))` also fails. +""" +macro autosize(size, model) + Meta.isexpr(size, :tuple) || error("@autosize's first argument must be a tuple, the size of the input") + Meta.isexpr(model, :call) || error("@autosize's second argument must be something like Chain(layers...)") + ex = makelazy(model) + @gensym m + quote + $m = $ex + $outputsize($m, $size) + $striplazy($m) + end |> esc +end + +function makelazy(ex::Expr) + n = underscoredepth(ex) + n == 0 && return ex + n == 1 && error("@autosize doesn't expect an underscore here: $ex") + n == 2 && return :($LazyLayer($(string(ex)), $(makefun(ex)), nothing)) + n > 2 && return Expr(ex.head, ex.args[1], map(makelazy, ex.args[2:end])...) +end +makelazy(x) = x + +function underscoredepth(ex::Expr) + # Meta.isexpr(ex, :tuple) && :_ in ex.args && return 10 + ex.head in (:call, :kw, :(->), :block) || return 0 + ex.args[1] == :(=>) && ex.args[2] == :_ && return 1 + m = maximum(underscoredepth, ex.args) + m == 0 ? 
0 : m+1 +end +underscoredepth(ex) = Int(ex == :_) + +#= + +@autosize (3,) Chain(one = Dense(_ => 10)) # needs kw +@autosize (10,) Maxout(() -> Dense(_ => 7, tanh), 3) # needs ->, block + +=# + +function makefun(ex) + @gensym s + Expr(:(->), s, replaceunderscore(ex, s)) +end + +replaceunderscore(e, s) = e == :_ ? s : e +replaceunderscore(ex::Expr, s) = Expr(ex.head, map(a -> replaceunderscore(a, s), ex.args)...) + +mutable struct LazyLayer + str::String + make::Function + layer +end + +function (l::LazyLayer)(x::AbstractArray) + if l.layer != nothing + return l.layer(x) + end + # s = channelsize(x) + s = size(x, max(1, ndims(x)-1)) + lay = l.make(s) + y = try + lay(x) + catch e + @error l.str + return nothing + end + l.layer = striplazy(lay) # is this a good idea? + return y +end + +#= + +Flux.outputsize(Chain(Dense(2=>3)), (4,)) # nice error +Flux.outputsize(Dense(2=>3), (4,)) # no nice error +@autosize (4,) Dense(2=>3) # no nice error + +@autosize (3,) Dense(2 => _) # shouldn't work, weird error + + +@autosize (3,5,6) LayerNorm(_,_) # no complaint, but +ans(rand(3,5,6)) # this fails + + + +``` +julia> Flux.outputsize(LayerNorm(2), (3,)) +(3,) + +julia> LayerNorm(2)(rand(Float32, 3)) +ERROR: DimensionMismatch: arrays could not be broadcast to a common size; got a dimension with lengths 2 and 3 + +julia> BatchNorm(2)(fill(Flux.nil, 3)) |> size +(3,) + +julia> BatchNorm(2)(rand(3)) +ERROR: arraysize: dimension out of range +``` + + +=# + +# channelsize(x) = size(x, max(1, ndims(x)-1)) + +using Functors: functor, @functor + +@functor LazyLayer # (layer,) + +function striplazy(x) + fs, re = functor(x) + re(map(striplazy, fs)) +end +striplazy(l::LazyLayer) = l.layer == nothing ? error("should be initialised!") : l.layer + +# Could make LazyLayer usable outside of @autosize +# For instance allow @lazy + +function Base.show(io::IO, l::LazyLayer) + printstyled(io, "LazyLayer(", color=:light_black) + if l.layer == nothing + printstyled(io, l.str, color=:red) + else + printstyled(io, l.layer, color=:green) + end + printstyled(io, ")", color=:light_black) +end + +_big_show(io::IO, l::LazyLayer, indent::Int=0, name=nothing) = _layer_show(io, l, indent, name) From 604f2b4432f62329dad528619c60a7f88c1bbe8d Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 4 Oct 2022 23:15:50 -0400 Subject: [PATCH 02/10] fix outputsize on LayerNorm --- src/outputsize.jl | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/outputsize.jl b/src/outputsize.jl index 432f43f929..2110262438 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -147,8 +147,30 @@ outputsize(m::AbstractVector, input::Tuple...; padbatch=false) = outputsize(Chai ## bypass statistics in normalization layers -for layer in (:LayerNorm, :BatchNorm, :InstanceNorm, :GroupNorm) - @eval (l::$layer)(x::AbstractArray{Nil}) = x +# for layer in (:LayerNorm, :BatchNorm, :InstanceNorm, :GroupNorm) +# @eval (l::$layer)(x::AbstractArray{Nil}) = x +# end +for layer in (:BatchNorm, :InstanceNorm, :GroupNorm) + @eval function (l::$layer)(x::AbstractArray{Nil}) + l.chs == size(x, ndims(x)-1) || throw(DimensionMismatch( + string($layer, " expected ", l.chs, " channels, but got ", _channelsize(x)))) + x + end +end + +_channelsize(x::AbstractArray) = size(x, ndims(x)-1) +_channelsize(x::AbstractVector) = size(x, 1) + +function (l::LayerNorm)(x::AbstractArray{Nil,N}) where N + l.affine || return x + n = length(l.size) + l.size[1:min(n,N)] == size(x)[1:min(n,N)] || 
throw(DimensionMismatch( + string("LayerNorm expected size of input starting with ", l.size, ", but got size(x) == ", size(x)))) + if n <= N + return x + else + return similar(x, l.size) + end end ## fixes for layers that don't work out of the box From 46e06c73685908036363caa75dbcc18e168f8a96 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Wed, 5 Oct 2022 00:44:24 -0400 Subject: [PATCH 03/10] tidy & improve --- src/Flux.jl | 1 + src/outputsize.jl | 140 ++++++++++++++++++---------------------------- 2 files changed, 56 insertions(+), 85 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index b7d27406b0..a9df19e8c5 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -55,6 +55,7 @@ include("layers/show.jl") include("loading.jl") include("outputsize.jl") +export @autosize include("data/Data.jl") using .Data diff --git a/src/outputsize.jl b/src/outputsize.jl index 2110262438..8711a6c0e8 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -147,32 +147,14 @@ outputsize(m::AbstractVector, input::Tuple...; padbatch=false) = outputsize(Chai ## bypass statistics in normalization layers -# for layer in (:LayerNorm, :BatchNorm, :InstanceNorm, :GroupNorm) -# @eval (l::$layer)(x::AbstractArray{Nil}) = x -# end -for layer in (:BatchNorm, :InstanceNorm, :GroupNorm) +for layer in (:BatchNorm, :InstanceNorm, :GroupNorm) # LayerNorm works fine @eval function (l::$layer)(x::AbstractArray{Nil}) l.chs == size(x, ndims(x)-1) || throw(DimensionMismatch( - string($layer, " expected ", l.chs, " channels, but got ", _channelsize(x)))) + string($layer, " expected ", l.chs, " channels, but got size(x) == ", size(x)))) x end end -_channelsize(x::AbstractArray) = size(x, ndims(x)-1) -_channelsize(x::AbstractVector) = size(x, 1) - -function (l::LayerNorm)(x::AbstractArray{Nil,N}) where N - l.affine || return x - n = length(l.size) - l.size[1:min(n,N)] == size(x)[1:min(n,N)] || throw(DimensionMismatch( - string("LayerNorm expected size of input starting with ", l.size, ", but got size(x) == ", size(x)))) - if n <= N - return x - else - return similar(x, l.size) - end -end - ## fixes for layers that don't work out of the box for (fn, Dims) in ((:conv, DenseConvDims),) @@ -192,25 +174,25 @@ for (fn, Dims) in ((:conv, DenseConvDims),) end -export @autosize - """ @autosize (size...,) Chain(Layer(_ => 2), Layer(_), ...) Returns the specified model, with each `_` replaced by an inferred number, -for input of the given size. +for input of the given `size`. -The unknown sizes are always the second-last dimension (or the length of a vector), -of that layer's input, which Flux usually regards as the channel dimension. +The unknown sizes are usually the second-last dimension of that layer's input, +which Flux regards as the channel dimension. +(A few layers, `Dense` & [`LayerNorm`](@ref), instead always use the first dimension.) The underscore may appear as an argument of a layer, or inside a `=>`. +It may be used in further calculations, such as `Dense(_ => _÷4)`. # Examples ``` -julia> @autosize (3,) Chain(Dense(_ => 2, sigmoid), Flux.Scale(_)) +julia> @autosize (3, 1) Chain(Dense(_ => 2, sigmoid), BatchNorm(_, affine=false)) Chain( Dense(3 => 2, σ), # 8 parameters - Scale(2), # 4 parameters -) # Total: 4 arrays, 12 parameters, 304 bytes. 
+ BatchNorm(2, affine=false), +) julia> img = [28, 28]; @@ -244,15 +226,15 @@ julia> outputsize(ans, (28, 28, 1, 32)) ``` Limitations: -* Won't work yet for Bilinear, except like `@autosize (5, 32) Flux.Bilinear(_ => 7)` -* Beyond a matrix it gets Dense wrong, e.g. `@autosize (2, 3, 4) Dense(_ => 5)` -* `LayerNorm(_,_)` probably won't work, needs first few input dims. -* RNN: `@autosize (7, 11) LSTM(_ => 5)` fails, but `outputsize(RNN(3=>7), (3,))` also fails. +* While `@autosize (5, 32) Flux.Bilinear(_ => 7)` is OK, something like `Bilinear((_, _) => 7)` will fail. +* While `Scale(_)` and `LayerNorm(_)` are fine (and use the first dimension), `Scale(_,_)` and `LayerNorm(_,_)` + will fail if `size(x,1) != size(x,2)`. +* RNNs won't work: `@autosize (7, 11) LSTM(_ => 5)` fails, because `outputsize(RNN(3=>7), (3,))` also fails, a known issue. """ macro autosize(size, model) Meta.isexpr(size, :tuple) || error("@autosize's first argument must be a tuple, the size of the input") Meta.isexpr(model, :call) || error("@autosize's second argument must be something like Chain(layers...)") - ex = makelazy(model) + ex = _makelazy(model) @gensym m quote $m = $ex @@ -261,38 +243,56 @@ macro autosize(size, model) end |> esc end -function makelazy(ex::Expr) - n = underscoredepth(ex) +function _makelazy(ex::Expr) + n = _underscoredepth(ex) n == 0 && return ex n == 1 && error("@autosize doesn't expect an underscore here: $ex") - n == 2 && return :($LazyLayer($(string(ex)), $(makefun(ex)), nothing)) - n > 2 && return Expr(ex.head, ex.args[1], map(makelazy, ex.args[2:end])...) + n == 2 && return :($LazyLayer($(string(ex)), $(_makefun(ex)), nothing)) + n > 2 && return Expr(ex.head, ex.args[1], map(_makelazy, ex.args[2:end])...) end -makelazy(x) = x +_makelazy(x) = x -function underscoredepth(ex::Expr) +function _underscoredepth(ex::Expr) # Meta.isexpr(ex, :tuple) && :_ in ex.args && return 10 ex.head in (:call, :kw, :(->), :block) || return 0 ex.args[1] == :(=>) && ex.args[2] == :_ && return 1 - m = maximum(underscoredepth, ex.args) + m = maximum(_underscoredepth, ex.args) m == 0 ? 0 : m+1 end -underscoredepth(ex) = Int(ex == :_) +_underscoredepth(ex) = Int(ex == :_) #= -@autosize (3,) Chain(one = Dense(_ => 10)) # needs kw -@autosize (10,) Maxout(() -> Dense(_ => 7, tanh), 3) # needs ->, block +@autosize (3,) Chain(one = Dense(_ => 4), two = softmax) # needs kw +@autosize (3, 45) Maxout(() -> Dense(_ => 6, tanh), 2) # needs ->, block + +# here Parallel gets two inputs, no problem: +@autosize (3,) Chain(SkipConnection(Dense(_ => 4), Parallel(vcat, Dense(_ => 5), Dense(_ => 6))), Flux.Scale(_)) =# -function makefun(ex) - @gensym s - Expr(:(->), s, replaceunderscore(ex, s)) +function _makefun(ex) + T = Meta.isexpr(ex, :call) ? ex.args[1] : Type + @gensym x s + Expr(:(->), x, Expr(:block, :($s = $autosizefor($T, $x)), _replaceunderscore(ex, s))) end -replaceunderscore(e, s) = e == :_ ? s : e -replaceunderscore(ex::Expr, s) = Expr(ex.head, map(a -> replaceunderscore(a, s), ex.args)...) +""" + autosizefor(::Type, x) + +If an `_` in your layer's constructor, used within `@autosize`, should +*not* mean the 2nd-last dimension, then you can overload this. + +For instance `autosizefor(::Type{<:Dense}, x::AbstractArray) = size(x, 1)` +is needed to make `@autosize (2,3,4) Dense(_ => 5)` return +`Dense(2 => 5)` rather than `Dense(3 => 5)`. 
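For example, a user-defined layer could opt into the same first-dimension rule. This is a minimal sketch, assuming only the definitions introduced in this patch; `RowwiseScale` is an invented layer, not part of Flux:

```julia
using Flux

# An invented layer which, like Dense, consumes the first dimension of its input:
struct RowwiseScale{T}
  scale::Vector{T}
end
RowwiseScale(n::Integer) = RowwiseScale(ones(Float32, n))
(l::RowwiseScale)(x::AbstractArray) = l.scale .* x

# Assumed overload, so that `_` in `RowwiseScale(_)` means size(x, 1) rather than the channel dim:
Flux.autosizefor(::Type{<:RowwiseScale}, x::AbstractArray) = size(x, 1)

# Then `@autosize (3, 10) Chain(Dense(_ => 4), RowwiseScale(_))` should build `RowwiseScale(4)`,
# since the output of the Dense layer has first dimension 4.
```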
+""" +autosizefor(::Type, x::AbstractArray) = size(x, max(1, ndims(x)-1)) +autosizefor(::Type{<:Dense}, x::AbstractArray) = size(x, 1) +autosizefor(::Type{<:LayerNorm}, x::AbstractArray) = size(x, 1) + +_replaceunderscore(e, s) = e == :_ ? s : e +_replaceunderscore(ex::Expr, s) = Expr(ex.head, map(a -> _replaceunderscore(a, s), ex.args)...) mutable struct LazyLayer str::String @@ -301,19 +301,10 @@ mutable struct LazyLayer end function (l::LazyLayer)(x::AbstractArray) - if l.layer != nothing - return l.layer(x) - end - # s = channelsize(x) - s = size(x, max(1, ndims(x)-1)) - lay = l.make(s) - y = try - lay(x) - catch e - @error l.str - return nothing - end - l.layer = striplazy(lay) # is this a good idea? + l.layer == nothing || return l.layer(x) + lay = l.make(x) + y = lay(x) + l.layer = lay # mutate after we know that call worked return y end @@ -329,30 +320,9 @@ Flux.outputsize(Dense(2=>3), (4,)) # no nice error @autosize (3,5,6) LayerNorm(_,_) # no complaint, but ans(rand(3,5,6)) # this fails - - -``` -julia> Flux.outputsize(LayerNorm(2), (3,)) -(3,) - -julia> LayerNorm(2)(rand(Float32, 3)) -ERROR: DimensionMismatch: arrays could not be broadcast to a common size; got a dimension with lengths 2 and 3 - -julia> BatchNorm(2)(fill(Flux.nil, 3)) |> size -(3,) - -julia> BatchNorm(2)(rand(3)) -ERROR: arraysize: dimension out of range -``` - - =# -# channelsize(x) = size(x, max(1, ndims(x)-1)) - -using Functors: functor, @functor - -@functor LazyLayer # (layer,) +@functor LazyLayer function striplazy(x) fs, re = functor(x) @@ -360,8 +330,8 @@ function striplazy(x) end striplazy(l::LazyLayer) = l.layer == nothing ? error("should be initialised!") : l.layer -# Could make LazyLayer usable outside of @autosize -# For instance allow @lazy +# Could make LazyLayer usable outside of @autosize, for instance allow Chain(@lazy Dense(_ => 2))? +# But then it will survive to produce weird structural gradients etc. function Base.show(io::IO, l::LazyLayer) printstyled(io, "LazyLayer(", color=:light_black) From 310b71eaed17434bd917bee8d6a26cb7a37be60b Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Wed, 5 Oct 2022 14:07:00 -0400 Subject: [PATCH 04/10] add tests, release note --- NEWS.md | 3 +++ src/outputsize.jl | 10 ------- test/outputsize.jl | 65 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9bf97ddb3b..d83e76d62a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # Flux Release Notes +## v0.13.7 +* Added [`@autosize` macro](https://github.com/FluxML/Flux.jl/pull/2078) + ## v0.13.4 * Added [`PairwiseFusion` layer](https://github.com/FluxML/Flux.jl/pull/1983) diff --git a/src/outputsize.jl b/src/outputsize.jl index 8711a6c0e8..6caff3035f 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -261,16 +261,6 @@ function _underscoredepth(ex::Expr) end _underscoredepth(ex) = Int(ex == :_) -#= - -@autosize (3,) Chain(one = Dense(_ => 4), two = softmax) # needs kw -@autosize (3, 45) Maxout(() -> Dense(_ => 6, tanh), 2) # needs ->, block - -# here Parallel gets two inputs, no problem: -@autosize (3,) Chain(SkipConnection(Dense(_ => 4), Parallel(vcat, Dense(_ => 5), Dense(_ => 6))), Flux.Scale(_)) - -=# - function _makefun(ex) T = Meta.isexpr(ex, :call) ? 
ex.args[1] : Type @gensym x s diff --git a/test/outputsize.jl b/test/outputsize.jl index 667b5bad76..2084b26369 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -142,16 +142,81 @@ end m = LayerNorm(32) @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) + m2 = LayerNorm(3, 2) + @test outputsize(m2, (3, 2)) == (3, 2) == size(m2(randn(3, 2))) + @test outputsize(m2, (3,)) == (3, 2) == size(m2(randn(3, 2))) m = BatchNorm(3) @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) + @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) m = InstanceNorm(3) @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) + @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) m = GroupNorm(16, 4) @test outputsize(m, (32, 32, 16, 16)) == (32, 32, 16, 16) @test outputsize(m, (32, 32, 16); padbatch=true) == (32, 32, 16, 1) + @test_throws Exception m(randn(Float32, 32, 32, 15, 4)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 15, 4)) end + +@testset "autosize macro" begin + m = @autosize (3,) Dense(_ => 4) + @test randn(3) |> m |> size == (4,) + + m = @autosize (3, 1) Chain(Dense(_ => 4), Dense(4 => 10), softmax) + @test randn(3, 5) |> m |> size == (10, 5) + + m = @autosize (2, 3, 4, 5) Dense(_ => 10) # goes by first dim, not 2nd-last + @test randn(2, 3, 4, 5) |> m |> size == (10, 3, 4, 5) + + m = @autosize (9,) Dense(_ => div(_,2)) + @test randn(9) |> m |> size == (4,) + + m = @autosize (3,) Chain(one = Dense(_ => 4), two = softmax) # needs kw + @test randn(3) |> m |> size == (4,) + + m = @autosize (3, 45) Maxout(() -> Dense(_ => 6, tanh), 2) # needs ->, block + @test randn(3, 45) |> m |> size == (6, 45) + + # here Parallel gets two inputs, no problem: + m = @autosize (3,) Chain(SkipConnection(Dense(_ => 4), Parallel(vcat, Dense(_ => 5), Dense(_ => 6))), Flux.Scale(_)) + @test randn(3) |> m |> size == (11,) + + # like Dense, LayerNorm goes by the first dimension: + m = @autosize (3, 4, 5) LayerNorm(_) + @test rand(3, 6, 7) |> m |> size == (3, 6, 7) + + m = @autosize (3, 3, 10) LayerNorm(_, _) # does not check that sizes match + @test rand(3, 3, 10) |> m |> size == (3, 3, 10) + + m = @autosize (3,) Flux.Bilinear(_ => 10) + @test randn(3) |> m |> size == (10,) + + m = @autosize (3, 1) Flux.Bilinear(_ => 10) + @test randn(3, 4) |> m |> size == (10, 4) + + @test_throws Exception @eval @autosize (3,) Flux.Bilinear((_,3) => 10) + + # first docstring example + m = @autosize (3, 1) Chain(Dense(_ => 2, sigmoid), BatchNorm(_, affine=false)) + @test randn(3, 4) |> m |> size == (2, 4) + + # evil docstring example + img = [28, 28]; + m = @autosize (img..., 1, 32) Chain( # size is only needed at runtime + Chain(c = Conv((3,3), _ => 5; stride=2, pad=SamePad()), + p = MeanPool((3,3)), + b = BatchNorm(_), + f = Flux.flatten), + Dense(_ => _÷4, relu, init=Flux.rand32), # can calculate output size _÷4 + SkipConnection(Dense(_ => _, relu), +), + Dense(_ => 10), + ) |> gpu # moves to GPU after initialisation + @test randn(Float32, img..., 1, 32) |> gpu |> m |> size == (10, 32) +end \ No newline at end of file From 537b0118b09bb3751b2e62828a3b76fc8260c34e Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 6 Oct 2022 21:08:04 -0400 
Subject: [PATCH 05/10] rrule errors, improvements, tests --- src/outputsize.jl | 55 ++++++++++++++++++++++------------------------ test/outputsize.jl | 26 ++++++++++++++++++++-- 2 files changed, 50 insertions(+), 31 deletions(-) diff --git a/src/outputsize.jl b/src/outputsize.jl index 6caff3035f..40170ad8d2 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -255,11 +255,11 @@ _makelazy(x) = x function _underscoredepth(ex::Expr) # Meta.isexpr(ex, :tuple) && :_ in ex.args && return 10 ex.head in (:call, :kw, :(->), :block) || return 0 - ex.args[1] == :(=>) && ex.args[2] == :_ && return 1 + ex.args[1] === :(=>) && ex.args[2] === :_ && return 1 m = maximum(_underscoredepth, ex.args) m == 0 ? 0 : m+1 end -_underscoredepth(ex) = Int(ex == :_) +_underscoredepth(ex) = Int(ex === :_) function _makefun(ex) T = Meta.isexpr(ex, :call) ? ex.args[1] : Type @@ -281,7 +281,7 @@ autosizefor(::Type, x::AbstractArray) = size(x, max(1, ndims(x)-1)) autosizefor(::Type{<:Dense}, x::AbstractArray) = size(x, 1) autosizefor(::Type{<:LayerNorm}, x::AbstractArray) = size(x, 1) -_replaceunderscore(e, s) = e == :_ ? s : e +_replaceunderscore(e, s) = e === :_ ? s : e _replaceunderscore(ex::Expr, s) = Expr(ex.head, map(a -> _replaceunderscore(a, s), ex.args)...) mutable struct LazyLayer @@ -290,45 +290,42 @@ mutable struct LazyLayer layer end -function (l::LazyLayer)(x::AbstractArray) - l.layer == nothing || return l.layer(x) - lay = l.make(x) - y = lay(x) - l.layer = lay # mutate after we know that call worked +@functor LazyLayer + +function (l::LazyLayer)(x::AbstractArray, ys::AbstractArray...) + l.layer === nothing || return l.layer(x, ys...) + made = l.make(x) # for something like `Bilinear((_,__) => 7)`, perhaps need `make(xy...)`, later. + y = made(x, ys...) + l.layer = made # mutate after we know that call worked return y end -#= - -Flux.outputsize(Chain(Dense(2=>3)), (4,)) # nice error -Flux.outputsize(Dense(2=>3), (4,)) # no nice error -@autosize (4,) Dense(2=>3) # no nice error - -@autosize (3,) Dense(2 => _) # shouldn't work, weird error - - -@autosize (3,5,6) LayerNorm(_,_) # no complaint, but -ans(rand(3,5,6)) # this fails - -=# - -@functor LazyLayer - -function striplazy(x) - fs, re = functor(x) +function striplazy(m) + fs, re = functor(m) re(map(striplazy, fs)) end -striplazy(l::LazyLayer) = l.layer == nothing ? error("should be initialised!") : l.layer +function striplazy(l::LazyLayer) + l.layer === nothing || return l.layer + error("LazyLayer should be initialised, e.g. by outputsize(model, size), before using stiplazy") +end # Could make LazyLayer usable outside of @autosize, for instance allow Chain(@lazy Dense(_ => 2))? # But then it will survive to produce weird structural gradients etc. +function ChainRulesCore.rrule(l::LazyLayer, x) + l(x), _ -> error("LazyLayer should never be used within a gradient. Call striplazy(model) first to remove all.") +end +function ChainRulesCore.rrule(::typeof(striplazy), m) + striplazy(m), _ -> error("striplazy should never be used within a gradient") +end + +params!(p::Params, x::LazyLayer, seen = IdSet()) = error("LazyLayer should never be used within params(m). 
Call striplazy(m) first.") function Base.show(io::IO, l::LazyLayer) printstyled(io, "LazyLayer(", color=:light_black) if l.layer == nothing - printstyled(io, l.str, color=:red) + printstyled(io, l.str, color=:magenta) else - printstyled(io, l.layer, color=:green) + printstyled(io, l.layer, color=:cyan) end printstyled(io, ")", color=:light_black) end diff --git a/test/outputsize.jl b/test/outputsize.jl index 2084b26369..2e9595f699 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -169,7 +169,7 @@ end m = @autosize (3,) Dense(_ => 4) @test randn(3) |> m |> size == (4,) - m = @autosize (3, 1) Chain(Dense(_ => 4), Dense(4 => 10), softmax) + m = @autosize (3, 1) Chain(Dense(_, 4), Dense(4 => 10), softmax) @test randn(3, 5) |> m |> size == (10, 5) m = @autosize (2, 3, 4, 5) Dense(_ => 10) # goes by first dim, not 2nd-last @@ -201,6 +201,9 @@ end m = @autosize (3, 1) Flux.Bilinear(_ => 10) @test randn(3, 4) |> m |> size == (10, 4) + m = @autosize (3,) SkipConnection(Dense(_ => _), Flux.Bilinear(_ => 10)) # Bilinear gets two inputs + @test randn(3, 4) |> m |> size == (10, 4) + @test_throws Exception @eval @autosize (3,) Flux.Bilinear((_,3) => 10) # first docstring example @@ -219,4 +222,23 @@ end Dense(_ => 10), ) |> gpu # moves to GPU after initialisation @test randn(Float32, img..., 1, 32) |> gpu |> m |> size == (10, 32) -end \ No newline at end of file +end + +@testset "LazyLayer" begin + # This is what `@autosize` uses, ideally nobody should make these by hand! + # Implicitly testeed by the macro, explicitly here too: + ld = Flux.LazyLayer("Dense(_ => 3, relu; init=??)", x -> Dense(Flux.autosizefor(Dense, x) => 3, relu, init=ones), nothing) + + lm = Chain(ld, Flux.Scale(3)) + @test string(ld) == "LazyLayer(Dense(_ => 3, relu; init=??))" + @test_throws Exception Flux.striplazy(lm) + + @test lm([1,2]) == [3,3,3] + + @test string(ld) == "LazyLayer(Dense(2 => 3, relu))" + @test Flux.striplazy(ld) isa Dense + + @test_throws Exception Flux.params(lm) + @test_throws Exception gradient(x -> sum(abs2, lm(x)), [1,2]) + @test_throws Exception gradient(m -> sum(abs2, Flux.striplazy(m)([1,2])), ld) +end From b2016e20430d3295518faad1d8120d85fec44a73 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 6 Oct 2022 23:27:22 -0400 Subject: [PATCH 06/10] documentation --- docs/src/outputsize.md | 85 +++++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index d692816b46..e69f4c9d7e 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -1,47 +1,72 @@ # Shape Inference -To help you generate models in an automated fashion, [`Flux.outputsize`](@ref) lets you -calculate the size returned produced by layers for a given size input. -This is especially useful for layers like [`Conv`](@ref). +Flux has some tools to help generate models in an automated fashion, by inferring the size +of arrays that layers will recieve, without doing any computation. +This is especially useful for convolutional models, where the same [`Conv`](@ref) layer +accepts any size of image, but the next layer may not. -It works by passing a "dummy" array into the model that preserves size information without running any computation. -`outputsize(f, inputsize)` works for all layers (including custom layers) out of the box. 
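For instance, a minimal usage sketch of that signature (the two-layer model here is invented for illustration; the commented sizes follow from the layer definitions):

```julia
using Flux

m = Chain(Dense(10 => 5, relu), Dense(5 => 2))

Flux.outputsize(m, (10, 64))              # (2, 64): the input size includes the batch dimension
Flux.outputsize(m, (10,); padbatch=true)  # (2, 1): a batch dimension of 1 is appended
```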
-By default, `inputsize` expects the batch dimension, -but you can exclude the batch size with `outputsize(f, inputsize; padbatch=true)` (assuming it to be one). +The higher-level one is a macro [`@autosize`](@ref) which acts on the code defining the layers, +and replaces each appearance of `_` with the relevant size. A simple example might be: -Using this utility function lets you automate model building for various inputs like so: ```julia -""" - make_model(width, height, inchannels, nclasses; - layer_config = [16, 16, 32, 32, 64, 64]) +@autosize (28, 28, 1, 32) Chain(Conv((3, 3), _ => 5, relu, stride=2), Flux.flatten, Dense(_ => 10)) +``` + +The size may be provided at runtime, like `@autosize (sz..., 1, 32) Chain(Conv(`..., but all the +layer constructors containing `_` must be explicitly written out -- the macro sees the code as written. -Create a CNN for a given set of configuration parameters. +This relies on a lower-level function [`outputsize`](@ref Flux.outputsize), which you can also use directly: -# Arguments -- `width`: the input image width -- `height`: the input image height -- `inchannels`: the number of channels in the input image -- `nclasses`: the number of output classes -- `layer_config`: a vector of the number of filters per each conv layer +```julia +c = Conv((3, 3), 1 => 5, relu, stride=2) +Flux.outputsize(c, (28, 28, 1, 32)) # returns (13, 13, 5, 32) +``` + +The function `outputsize` works by passing a "dummy" array into the model, which propagates through very cheaply. +It should work for all layers, including custom layers, out of the box. + +An example of how to automate model building is this: +```julia """ -function make_model(width, height, inchannels, nclasses; - layer_config = [16, 16, 32, 32, 64, 64]) - # construct a vector of conv layers programmatically - conv_layers = [Conv((3, 3), inchannels => layer_config[1])] - for (infilters, outfilters) in zip(layer_config, layer_config[2:end]) - push!(conv_layers, Conv((3, 3), infilters => outfilters)) + make_model(width, height, [inchannels, nclasses; layer_config]) + +Create a CNN for a given set of configuration parameters. 
Arguments: +- `width`, `height`: the input image size in pixels +- `inchannels`: the number of channels in the input image, default `1` +- `nclasses`: the number of output classes, default `10` +- Keyword `layer_config`: a vector of the number of filters per layer, default `[16, 16, 32, 64]` +""" +function make_model(width, height, inchannels = 1, nclasses = 10; + layer_config = [16, 16, 32, 64]) + # construct a vector of layers: + conv_layers = [] + push!(conv_layers, Conv((5, 5), inchannels => layer_config[1], relu, pad=SamePad())) + for (inch, outch) in zip(layer_config, layer_config[2:end]) + push!(conv_layers, Conv((3, 3), inch => outch, sigmoid, stride=2)) end - # compute the output dimensions for the conv layers - # use padbatch=true to set the batch dimension to 1 - conv_outsize = Flux.outputsize(conv_layers, (width, height, nchannels); padbatch=true) + # compute the output dimensions after these conv layers: + conv_outsize = Flux.outputsize(conv_layers, (width, height, inchannels); padbatch=true) + + # use this to define appropriate Dense layer: + last_layer = Dense(prod(conv_outsize) => nclasses) + return Chain(conv_layers..., Flux.flatten, last_layer) +end + +make_model(28, 28, 3, layer_config = [8, 17, 33, 65]) +``` + +Alternatively, using the macro, the definition of `make_model` could end with: - # the input dimension to Dense is programatically calculated from - # width, height, and nchannels - return Chain(conv_layers..., Dense(prod(conv_outsize) => nclasses)) +``` + # compute the output dimensions & construct appropriate Dense layer: + return @autosize (width, height, inchannels, 1) Chain(conv_layers..., Flux.flatten, Dense(_ => nclasses)) end ``` +### Listing + ```@docs +Flux.@autosize Flux.outputsize ``` From e2ab1ece4ea1ca1ba39188d44df96aefea3d9018 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 7 Oct 2022 09:50:03 -0400 Subject: [PATCH 07/10] tweaks --- docs/src/outputsize.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index e69f4c9d7e..4e41b51648 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -5,17 +5,18 @@ of arrays that layers will recieve, without doing any computation. This is especially useful for convolutional models, where the same [`Conv`](@ref) layer accepts any size of image, but the next layer may not. -The higher-level one is a macro [`@autosize`](@ref) which acts on the code defining the layers, -and replaces each appearance of `_` with the relevant size. A simple example might be: +The higher-level tool is a macro [`@autosize`](@ref) which acts on the code defining the layers, +and replaces each appearance of `_` with the relevant size. This simple example returns a model +with `Dense(845 => 10)` as the last layer: ```julia @autosize (28, 28, 1, 32) Chain(Conv((3, 3), _ => 5, relu, stride=2), Flux.flatten, Dense(_ => 10)) ``` -The size may be provided at runtime, like `@autosize (sz..., 1, 32) Chain(Conv(`..., but all the +The input size may be provided at runtime, like `@autosize (sz..., 1, 32) Chain(Conv(`..., but all the layer constructors containing `_` must be explicitly written out -- the macro sees the code as written. 
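As a concrete sketch of that runtime-size pattern (`build_classifier` is an invented helper; it assumes a Flux version that provides `@autosize`):

```julia
using Flux

# The image size is only known at runtime, but every layer constructor that uses `_`
# is written out literally inside the macro call, so the macro can rewrite it.
function build_classifier(imgsize::Tuple{Int,Int}, nclasses::Int)
    @autosize (imgsize..., 1, 1) Chain(
        Conv((3, 3), _ => 8, relu, pad=SamePad()),
        Flux.flatten,
        Dense(_ => nclasses),
    )
end

build_classifier((28, 28), 10)  # should end in Dense(6272 => 10), since 28*28*8 == 6272
```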
-This relies on a lower-level function [`outputsize`](@ref Flux.outputsize), which you can also use directly: +This macro relies on a lower-level function [`outputsize`](@ref Flux.outputsize), which you can also use directly: ```julia c = Conv((3, 3), 1 => 5, relu, stride=2) @@ -53,7 +54,9 @@ function make_model(width, height, inchannels = 1, nclasses = 10; return Chain(conv_layers..., Flux.flatten, last_layer) end -make_model(28, 28, 3, layer_config = [8, 17, 33, 65]) +m = make_model(28, 28, 3, layer_config = [9, 17, 33, 65]) + +Flux.outputsize(m, (28, 28, 3, 42)) == (10, 42) == size(m(randn(Float32, 28, 28, 3, 42))) ``` Alternatively, using the macro, the definition of `make_model` could end with: From 67ea6a7ce575878b40f088666da676c53814ff89 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 8 Oct 2022 20:33:58 -0400 Subject: [PATCH 08/10] add jldoctest; output = false --- docs/src/outputsize.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index 4e41b51648..768815c14e 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -27,7 +27,7 @@ The function `outputsize` works by passing a "dummy" array into the model, which It should work for all layers, including custom layers, out of the box. An example of how to automate model building is this: -```julia +```jldoctest; output = false """ make_model(width, height, [inchannels, nclasses; layer_config]) @@ -57,6 +57,10 @@ end m = make_model(28, 28, 3, layer_config = [9, 17, 33, 65]) Flux.outputsize(m, (28, 28, 3, 42)) == (10, 42) == size(m(randn(Float32, 28, 28, 3, 42))) + +# output + +true ``` Alternatively, using the macro, the definition of `make_model` could end with: From 936bb5b0e91567f4398f644a41a1bb739fb50688 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 10 Oct 2022 09:07:35 -0400 Subject: [PATCH 09/10] tweak --- docs/src/outputsize.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index 768815c14e..037a90d6aa 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -35,7 +35,7 @@ Create a CNN for a given set of configuration parameters. Arguments: - `width`, `height`: the input image size in pixels - `inchannels`: the number of channels in the input image, default `1` - `nclasses`: the number of output classes, default `10` -- Keyword `layer_config`: a vector of the number of filters per layer, default `[16, 16, 32, 64]` +- Keyword `layer_config`: a vector of the number of channels per layer, default `[16, 16, 32, 64]` """ function make_model(width, height, inchannels = 1, nclasses = 10; layer_config = [16, 16, 32, 64]) From 5c1ed685351ee7a10b9e73bbc1e914b47a160fb9 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 10 Oct 2022 11:15:58 -0400 Subject: [PATCH 10/10] using Flux --- docs/src/outputsize.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/outputsize.md b/docs/src/outputsize.md index 037a90d6aa..9376db9ab8 100644 --- a/docs/src/outputsize.md +++ b/docs/src/outputsize.md @@ -27,7 +27,7 @@ The function `outputsize` works by passing a "dummy" array into the model, which It should work for all layers, including custom layers, out of the box. 
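For instance, a plain user-defined callable needs no extra definitions (a minimal sketch; `Doubler` is invented for illustration and is not a Flux layer):

```julia
using Flux

struct Doubler end                      # an invented layer with no trainable parameters
(::Doubler)(x::AbstractArray) = 2 .* x

m = Chain(Dense(5 => 3, relu), Doubler(), Dense(3 => 2))

Flux.outputsize(m, (5, 16))  # (2, 16), with no special support needed for Doubler
```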
An example of how to automate model building is this: -```jldoctest; output = false +```jldoctest; output = false, setup = :(using Flux) """ make_model(width, height, [inchannels, nclasses; layer_config])