From 5531d91d7add47d3b12cbe91155b474575df2568 Mon Sep 17 00:00:00 2001
From: Max Freudenberg <max.freudenberg@protonmail.com>
Date: Mon, 19 Apr 2021 13:49:11 +0200
Subject: [PATCH 1/3] introduce trilinear upsampling

---
 src/upsample.jl | 85 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/src/upsample.jl b/src/upsample.jl
index 43c52ae49..b58434cbd 100644
--- a/src/upsample.jl
+++ b/src/upsample.jl
@@ -1,5 +1,6 @@
 export upsample_nearest, ∇upsample_nearest,
     upsample_bilinear, ∇upsample_bilinear,
+    upsample_trilinear, ∇upsample_trilinear,
     pixel_shuffle
 
 """
@@ -9,7 +10,7 @@ export upsample_nearest, ∇upsample_nearest,
 Upsamples the array `x` by integer multiples along the first `S` dimensions.
 Subsequent dimensions of `x` are not altered.
 
-Either the `scale` factors or the final output `size` can be specified. 
+Either the `scale` factors or the final output `size` can be specified.
 
 See also [`upsample_bilinear`](@ref), for two dimensions of an `N=4` array.
 
@@ -257,6 +258,88 @@ function rrule(::typeof(upsample_bilinear), x; size)
     return Ω, upsample_bilinear_pullback
 end
 
+###########
+# trilinear
+###########
+"""
+    upsample_trilinear(x::AbstractArray{T,5}, scale::NTuple{3,Real})
+    upsample_trilinear(x::AbstractArray{T,5}; size::NTuple{3,Integer})
+
+Upsamples the first 3 dimensions of the array `x` by the upsample factors stored in `scale`,
+using trilinear interpolation. As an alternative to using `scale`, the resulting image `size`
+can be directly specified with a keyword argument.
+
+The size of the output is equal to
+`(scale[1]*S1, scale[2]*S2, scale[3]*S3, S4, S5)`, where `S1, S2, S3, S4, S5 = size(x)`.
+
+# Examples
+
+```julia
+upsample_trilinear(x, (2, 3, 4))
+upsample_trilinear(x; size=(4, 9, 11))  # specify ouput size instead
+upsample_trilinear(x, (2.5, 3.5, pi))  # non-integer scaling factors are allowed
+```
+"""
+function upsample_trilinear(x::AbstractArray{<:Any,5}, scale::NTuple{3,Real})
+    outsize = ntuple(i -> floor(Int, scale[i] * Base.size(x, i)), 3)
+    return upsample_trilinear(x; size=outsize)
+end
+
+upsample_trilinear(x, scale::Real) = upsample_trilinear(x, (scale,scale,scale))
+
+function upsample_trilinear(x::AbstractArray{T,5}; size::NTuple{3,Integer}) where T
+    w,h,d,c,n = Base.size(x)
+    if (w,h,d) == size
+        return x
+    end
+    y = similar(x, T, size..., c, n)
+    return upsample_trilinear_whdcn!(y, x)
+end
+
+function upsample_trilinear(x::AbstractArray{T,5}; size::NTuple{3,Integer}) where T<:Integer
+    y = float.(x)
+    res = upsample_trilinear(y; size=size)
+    return round.(T, res)
+end
+
+# placeholder for CPU implementation
+# which is overloaded in NNlibCUDA
+# I think a CPU implementation doesn't make sense, as it will be way too slow
+# for any meaningful data
+function upsample_trilinear_whdcn! end
+
+"""
+    ∇upsample_trilinear(Δ::AbstractArray{T,5}; size::NTuple{3,Integer}) where T
+
+# Arguments
+- `Δ`: Incoming gradient array, backpropagated from downstream layers
+- `size`: Lateral size & depth (W,H,D) of the image upsampled in the first place
+
+# Outputs
+- `dx`: Downsampled version of `Δ`
+"""
+function ∇upsample_trilinear(Δ::AbstractArray{T,5}; size::NTuple{3,Integer}) where T
+    w, h, d, c, n = Base.size(Δ)
+    out_w, out_h, out_d = size
+    if (w,h,d) == (out_w, out_h, out_d)
+        return Δ
+    end
+    dx = zero(similar(Δ, T, size..., c, n))
+    return ∇upsample_trilinear_whdcn!(dx, Δ)
+end
+
+# placeholder
+function ∇upsample_trilinear_whdcn! end
+
+function rrule(::typeof(upsample_trilinear), x; size)
+    Ω = upsample_trilinear(x; size=size)
+    function upsample_trilinear_pullback(Δ)
+        (NO_FIELDS, ∇upsample_trilinear(Δ; size=(Base.size(x,1), Base.size(x,2), Base.size(x,3))))
+    end
+    return Ω, upsample_trilinear_pullback
+end
+
+
 """
     pixel_shuffle(x, r::Integer)
 

From d44fb88445625678fa31ae7cb8d85823e979d453 Mon Sep 17 00:00:00 2001
From: Max Freudenberg <max.freudenberg@protonmail.com>
Date: Sun, 25 Apr 2021 17:56:14 +0200
Subject: [PATCH 2/3] add trilinear CPU implementation

---
 src/upsample.jl  | 83 ++++++++++++++++++++++++++++++++++++++++++++----
 test/upsample.jl | 32 ++++++++++++++++++-
 2 files changed, 107 insertions(+), 8 deletions(-)

diff --git a/src/upsample.jl b/src/upsample.jl
index b58434cbd..3798bf3aa 100644
--- a/src/upsample.jl
+++ b/src/upsample.jl
@@ -302,11 +302,44 @@ function upsample_trilinear(x::AbstractArray{T,5}; size::NTuple{3,Integer}) wher
     return round.(T, res)
 end
 
-# placeholder for CPU implementation
-# which is overloaded in NNlibCUDA
-# I think a CPU implementation doesn't make sense, as it will be way too slow
-# for any meaningful data
-function upsample_trilinear_whdcn! end
+function upsample_trilinear_whdcn!(output::AbstractArray{T,5}, input::AbstractArray{T,5}) where T
+    size(input)[4:5] == size(output)[4:5] || error("Number of input and output channels and batches must match. Got input $(size(input)) and output $(size(output))")
+    in_w, in_h, in_d, channels, batches = size(input)
+    # treat batch and channel dimension as one for better parallelization granularity
+    channels *= batches
+    out_w, out_h, out_d, _, _ = size(output)
+    output_slice_size = out_h * out_w * out_d
+
+    # T() and // so that we can handle rationals (super slow)
+    width_scale  = T((in_w - 1) // (out_w - 1))
+    height_scale = T((in_h - 1) // (out_h - 1))
+    depth_scale  = T((in_d - 1) // (out_d - 1))
+
+    @inline idx(c, d, h, w) = c * in_d * in_h * in_w + d * in_h * in_w + h * in_w + w + 1
+
+    @inbounds Threads.@threads for c in 0:channels-1
+        for od in 0:out_d-1
+            id0, id1, d0lambda, d1lambda = compute_source_index_and_lambda(depth_scale, od, in_d, out_d)
+            for oh in 0:out_h-1
+                ih0, ih1, h0lambda, h1lambda = compute_source_index_and_lambda(height_scale, oh, in_h, out_h)
+                for ow in 0:out_w-1
+                    iw0, iw1, w0lambda, w1lambda = compute_source_index_and_lambda(width_scale, ow, in_w, out_w)
+                    output_offset = c * output_slice_size + od * out_w * out_h + oh * out_w + ow + 1
+                    output[output_offset] =
+                        d0lambda * h0lambda * w0lambda * input[idx(c, id0, ih0, iw0)] + # d0 * h0 * w0 * i000
+                        d0lambda * h0lambda * w1lambda * input[idx(c, id0, ih0, iw1)] + # d0 * h0 * w1 * i001
+                        d0lambda * h1lambda * w0lambda * input[idx(c, id0, ih1, iw0)] + # d0 * h1 * w0 * i010
+                        d0lambda * h1lambda * w1lambda * input[idx(c, id0, ih1, iw1)] + # d0 * h1 * w1 * i011
+                        d1lambda * h0lambda * w0lambda * input[idx(c, id1, ih0, iw0)] + # d1 * h0 * w0 * i100
+                        d1lambda * h0lambda * w1lambda * input[idx(c, id1, ih0, iw1)] + # d1 * h0 * w1 * i101
+                        d1lambda * h1lambda * w0lambda * input[idx(c, id1, ih1, iw0)] + # d1 * h1 * w0 * i110
+                        d1lambda * h1lambda * w1lambda * input[idx(c, id1, ih1, iw1)]   # d1 * h1 * w1 * i111
+                end
+            end
+        end
+    end
+    return output
+end
 
 """
     ∇upsample_trilinear(Δ::AbstractArray{T,5}; size::NTuple{3,Integer}) where T
@@ -328,8 +361,44 @@ function ∇upsample_trilinear(Δ::AbstractArray{T,5}; size::NTuple{3,Integer})
     return ∇upsample_trilinear_whdcn!(dx, Δ)
 end
 
-# placeholder
-function ∇upsample_trilinear_whdcn! end
+function ∇upsample_trilinear_whdcn!(dx::AbstractArray{T,5}, Δ::AbstractArray{T,5}) where T
+    size(dx)[4:5] == size(Δ)[4:5] || error("Number of input and output channels and batches must match. Got dx $(size(dx)) and Δ $(size(Δ))")
+    in_w, in_h, in_d, channels, batches = size(dx)
+    # treat batch and channel dimension as one for better parallelization granularity
+    channels *= batches
+    out_w, out_h, out_d, _, _ = size(Δ)
+    output_slice_size = out_h * out_w * out_d
+
+    # T() and // so that we can handle rationals (super slow)
+    width_scale  = T((in_w - 1) // (out_w - 1))
+    height_scale = T((in_h - 1) // (out_h - 1))
+    depth_scale  = T((in_d - 1) // (out_d - 1))
+
+    @inline idx(c, d, h, w) = c * in_d * in_h * in_w + d * in_h * in_w + h * in_w + w + 1
+
+    @inbounds Threads.@threads for c in 0:channels-1
+        for od in 0:out_d-1
+            id0, id1, d0lambda, d1lambda = compute_source_index_and_lambda(depth_scale, od, in_d, out_d)
+            for oh in 0:out_h-1
+                ih0, ih1, h0lambda, h1lambda = compute_source_index_and_lambda(height_scale, oh, in_h, out_h)
+                for ow in 0:out_w-1
+                    iw0, iw1, w0lambda, w1lambda = compute_source_index_and_lambda(width_scale, ow, in_w, out_w)
+                    output_offset = c * output_slice_size + od * out_w * out_h + oh * out_w + ow + 1
+                    Δ_value = Δ[output_offset]
+                    dx[idx(c, id0, ih0, iw0)] += d0lambda * h0lambda * w0lambda * Δ_value  # /* i000 */
+                    dx[idx(c, id0, ih0, iw1)] += d0lambda * h0lambda * w1lambda * Δ_value  # /* i001 */
+                    dx[idx(c, id0, ih1, iw0)] += d0lambda * h1lambda * w0lambda * Δ_value  # /* i010 */
+                    dx[idx(c, id0, ih1, iw1)] += d0lambda * h1lambda * w1lambda * Δ_value  # /* i011 */
+                    dx[idx(c, id1, ih0, iw0)] += d1lambda * h0lambda * w0lambda * Δ_value  # /* i100 */
+                    dx[idx(c, id1, ih0, iw1)] += d1lambda * h0lambda * w1lambda * Δ_value  # /* i101 */
+                    dx[idx(c, id1, ih1, iw0)] += d1lambda * h1lambda * w0lambda * Δ_value  # /* i110 */
+                    dx[idx(c, id1, ih1, iw1)] += d1lambda * h1lambda * w1lambda * Δ_value  # /* i111 */
+                end
+            end
+        end
+    end
+    return dx
+end
 
 function rrule(::typeof(upsample_trilinear), x; size)
     Ω = upsample_trilinear(x; size=size)
diff --git a/test/upsample.jl b/test/upsample.jl
index d808aeb89..a58bdd38e 100644
--- a/test/upsample.jl
+++ b/test/upsample.jl
@@ -5,7 +5,7 @@
     y = upsample_nearest(x, (2,3))
     @test size(y) == (4,6,1,1)
     ∇upsample_nearest(y, (2,3)) == [6 12; 18 24]
-    
+
     gradtest(x -> upsample_nearest(x, (2,3)), rand(2,2,1,1))
 
     y2 = upsample_nearest(x, size=(4,6))
@@ -65,6 +65,36 @@ end
     @test y == y_true_int
 end
 
+@testset "Trilinear upsampling" begin
+    # Layout: WHDCN, where D is depth
+    # we generate data which is constant along W & H and differs in D
+    # then we upsample along all dimensions
+    x = ones(Float32, 3,3,3,1,1)
+    x[:,:,1,:,:] .= 1.
+    x[:,:,2,:,:] .= 2.
+    x[:,:,3,:,:] .= 3.
+
+    y_true = ones(Float32, 5,5,5,1,1)
+    y_true[:,:,1,:,:] .= 1.
+    y_true[:,:,2,:,:] .= 1.5
+    y_true[:,:,3,:,:] .= 2.
+    y_true[:,:,4,:,:] .= 2.5
+    y_true[:,:,5,:,:] .= 3.
+
+    y = upsample_trilinear(x; size=(5,5,5))
+
+    @test size(y) == size(y_true)
+    @test eltype(y) == Float32
+    @test collect(y) ≈ collect(y_true)
+
+    # this test only works when align_corners=false (not present for CPU yet)
+    # o = ones(Float32,8,8,8,1,1)
+    # grad_true = 8*ones(Float32,4,4,4,1,1)
+    # @test ∇upsample_trilinear(o; size=(4,4,4)) ≈ grad_true
+
+    gradtest(x -> upsample_trilinear(x, (2,2,2)), x, atol=1e-3)
+end
+
 @testset "pixel_shuffle" begin
     x = reshape(1:16, (2, 2, 4, 1))
     # [:, :, 1, 1] =

From a2bfabdbdcdb12bde084116ad4887fd404f1a24e Mon Sep 17 00:00:00 2001
From: Max Freudenberg <max.freudenberg@protonmail.com>
Date: Sun, 25 Apr 2021 18:18:17 +0200
Subject: [PATCH 3/3] perform gradtest with F64

---
 test/upsample.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/upsample.jl b/test/upsample.jl
index a58bdd38e..7440bdc39 100644
--- a/test/upsample.jl
+++ b/test/upsample.jl
@@ -92,7 +92,8 @@ end
     # grad_true = 8*ones(Float32,4,4,4,1,1)
     # @test ∇upsample_trilinear(o; size=(4,4,4)) ≈ grad_true
 
-    gradtest(x -> upsample_trilinear(x, (2,2,2)), x, atol=1e-3)
+    x = Float64.(x)
+    gradtest(x -> upsample_trilinear(x, (2,2,2)), x)
 end
 
 @testset "pixel_shuffle" begin