From 28ff9a5ef8f6dbccf9311caad55b6921b30e6da3 Mon Sep 17 00:00:00 2001
From: Mohamed Tarek
Date: Tue, 9 Feb 2021 02:40:15 +1100
Subject: [PATCH 01/25] proof of concept

---
 Project.toml                   |   6 +-
 src/AbstractDifferentiation.jl | 329 ++++++++++++++++++++++++++++++++-
 test/runtests.jl               |  77 +++++++-
 3 files changed, 408 insertions(+), 4 deletions(-)

diff --git a/Project.toml b/Project.toml
index 184f921..95fb5a4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,11 +3,15 @@
+[deps]
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+
 [compat]
 julia = "1"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 
 [targets]
-test = ["Test"]
+test = ["Test", "FiniteDifferences"]
diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index 93d7351..ad539ca 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -1,5 +1,332 @@
 module AbstractDifferentiation
 
-# Write your package code here.
+using LinearAlgebra
+
+export AD
+
+const AD = AbstractDifferentiation
+
+abstract type AbstractBackend end
+abstract type AbstractFiniteDifference <: AbstractBackend end
+abstract type AbstractForwardMode <: AbstractBackend end
+abstract type AbstractReverseMode <: AbstractBackend end
+
+struct HigherOrderBackend{B} <: AbstractBackend
+    backends::B
+end
+reduceorder(b::AbstractBackend) = b
+function reduceorder(b::HigherOrderBackend)
+    return HigherOrderBackend(reverse(Base.tail(reverse(b.backends))))
+end
+lowest(b::AbstractBackend) = b
+lowest(b::HigherOrderBackend) = b.backends[end]
+secondlowest(b::HigherOrderBackend) = lowest(reduceorder(b))
+
+# If the primal value is in y, extract it.
+# Otherwise, re-compute it, e.g. in finite diff.
+primalvalue(::AbstractFiniteDifference, ::Any, f, xs) = f(xs...)
+primalvalue(::AbstractBackend, ys, ::Any, ::Any) = primalvalue(ys)
+primalvalue(x::Tuple) = map(primalvalue, x)
+primalvalue(x) = x
+
+function derivative(ab::AbstractBackend, f, xs::Number...)
+    return getindex.(jacobian(lowest(ab), f, xs...), 1)
+end
+function gradient(ab::AbstractBackend, f, xs...)
+    return adjoint.(jacobian(lowest(ab), f, xs...))
+end
+function jacobian(ab::AbstractBackend, f, xs...) end
+function hessian(ab::AbstractBackend, f, xs...)
+    return jacobian(secondlowest(ab), (xs...,) -> begin
+        gradient(lowest(ab), f, xs...)
+    end, xs...)
+end
+
+function value_and_derivative(ab::AbstractBackend, f, xs::Number...)
+    value, jacs = value_and_jacobian(lowest(ab), f, xs...)
+    return value[1], getindex.(jacs, 1)
+end
+function value_and_gradient(ab::AbstractBackend, f, xs...)
+    value, jacs = value_and_jacobian(lowest(ab), f, xs...)
+    return value, adjoint.(jacs)
+end
+function value_and_jacobian(ab::AbstractBackend, f, xs...)
+    local value
+    primalcalled = false
+    jacs = jacobian(lowest(ab), (_xs...,) -> begin
+        v = f(_xs...)
+        if !primalcalled
+            value = primalvalue(ab, v, f, xs)
+            primalcalled = true
+        end
+        return v
+    end, xs...)
+    return value, jacs
+end
+function value_and_hessian(ab::AbstractBackend, f, xs...)
+    local value
+    primalcalled = false
+    hess = jacobian(secondlowest(ab), (_xs...,) -> begin
+        v, g = value_and_gradient(lowest(ab), f, _xs...)
+        if !primalcalled
+            value = primalvalue(ab, v, f, xs)
+            primalcalled = true
+        end
+        return g
+    end, xs...)
+    return value, hess
+end
+function value_and_hessian(ab::HigherOrderBackend, f, xs...)
+    local value
+    primalcalled = false
+    hess = jacobian(secondlowest(ab), (_xs...,) -> begin
+        v, g = value_and_gradient(lowest(ab), f, _xs...)
+        if !primalcalled
+            value = primalvalue(ab, v, f, xs)
+            primalcalled = true
+        end
+        return g
+    end, xs...)
+    return value, hess
+end
+function value_gradient_and_hessian(ab::AbstractBackend, f, xs...)
+    local value
+    primalcalled = false
+    grads, hess = value_and_jacobian(secondlowest(ab), (_xs...,) -> begin
+        v, g = value_and_gradient(lowest(ab), f, _xs...)
+        if !primalcalled
+            value = primalvalue(secondlowest(ab), v, f, xs)
+            primalcalled = false
+        end
+        return g
+    end, xs...)
+    return value, grads, hess
+end
+function value_gradient_and_hessian(ab::HigherOrderBackend, f, xs...)
+    local value
+    primalcalled = false
+    grads, hess = value_and_jacobian(secondlowest(ab), (_xs...,) -> begin
+        v, g = value_and_gradient(lowest(ab), f, _xs...)
+        if !primalcalled
+            value = primalvalue(secondlowest(ab), v, f, xs)
+            primalcalled = true
+        end
+        return g
+    end, xs...)
+    return value, grads, hess
+end
+
+function pushforward_function(
+    ab::AbstractBackend,
+    f,
+    xs::Union{Number, AbstractArray{<:Number}}...,
+)
+    return (ds) -> begin
+        return jacobian(lowest(ab), (xds...,) -> begin
+            if ds isa Tuple
+                @assert length(xs) == length(ds)
+                newxs = xs .+ ds .* xds
+                return f(newxs...)
+            else
+                @assert length(xs) == length(xds) == 1
+                newx = xs[1] + ds * xds[1]
+                return f(newx)
+            end
+        end, _zero.(xs, ds)...)
+    end
+end
+function value_and_pushforward_function(
+    ab::AbstractBackend,
+    f,
+    xs::Union{Number, AbstractArray{<:Number}}...,
+)
+    return (ds) -> begin
+        @assert ds isa Tuple && length(ds) == length(xs)
+        return value_and_jacobian(lowest(ab), (xds...,) -> begin
+            if ds isa Tuple
+                @assert length(xs) == length(ds)
+                newxs = xs .+ ds .* xds
+                return f(newxs...)
+            else
+                @assert length(xs) == length(xds) == 1
+                newx = xs[1] + ds * xds[1]
+                return f(newx)
+            end
+        end, _zero.(xs, ds)...)
+    end
+end
+
+_zero(::Number, d::Number) = zero(d)
+_zero(::Number, d::AbstractVector) = zero(d)
+_zero(::AbstractVector, d::AbstractVector) = zero(eltype(d))
+_zero(::AbstractVector, d::AbstractMatrix) = zero(similar(d, size(d, 2)))
+_zero(::AbstractMatrix, d::AbstractMatrix) = zero(d)
+_zero(::Any, d::Any) = zero(d)
+
+function pullback_function(ab::AbstractBackend, f, xs...)
+    return (ws) -> begin
+        jacs = jacobian(lowest(ab), (xs...,) -> begin
+            vs = f(xs...)
+            if ws isa Tuple
+                @assert length(vs) == length(ws)
+                return sum(zip(vs, ws)) do v, w
+                    if w isa Union{AbstractMatrix, UniformScaling} && v isa AbstractVector
+                        return w' * v
+                    else
+                        # for arbitrary arrays
+                        return dot(w, v)
+                    end
+                end
+            else
+                w, v = ws, vs
+                if w isa Union{AbstractMatrix, UniformScaling} && v isa AbstractVector
+                    return w' * v
+                else
+                    # for arbitrary arrays
+                    return dot(w, v)
+                end
+            end
+        end, xs...)
+        return adjoint.(jacs)
+    end
+end
+function value_and_pullback_function(
+    ab::AbstractBackend,
+    f,
+    xs...,
+)
+    return (ws) -> begin
+        local value
+        primalcalled = false
+        jacs = jacobian(lowest(ab), (_xs...,) -> begin
+            vs = f(_xs...)
+            if !primalcalled
+                value = primalvalue(lowest(ab), vs, f, xs)
+                primalcalled = true
+            end
+            if ws isa Tuple
+                @assert length(vs) == length(ws)
+                return sum(zip(vs, ws)) do v, w
+                    if w isa Union{AbstractMatrix, UniformScaling} && v isa AbstractVector
+                        return w' * v
+                    else
+                        # for arbitrary arrays
+                        return dot(w, v)
+                    end
+                end
+            else
+                w, v = ws, vs
+                if w isa Union{AbstractMatrix, UniformScaling} && v isa AbstractVector
+                    return w' * v
+                else
+                    # for arbitrary arrays
+                    return dot(w, v)
+                end
+            end
+        end, xs...)
+        return value, adjoint.(jacs)
+    end
+end
+
+struct LazyDerivative{B, F, X}
+    backend::B
+    f::F
+    xs::X
+end
+function Base.:*(d::LazyDerivative, y)
+    return derivative(d.ab, d.f, d.xs...) * y
+end
+function Base.:*(y, d::LazyDerivative)
+    return y * derivative(d.ab, d.f, d.xs...)
+end
+
+struct LazyGradient{B, F, X}
+    backend::B
+    f::F
+    xs::X
+end
+Base.:*(d::LazyGradient, y) = gradient(d.ab, d.f, d.xs...) * y
+Base.:*(y, d::LazyGradient) = y * gradient(d.ab, d.f, d.xs...)
+
+struct LazyJacobian{B, F, X}
+    backend::B
+    f::F
+    xs::X
+end
+function Base.:*(d::LazyJacobian, ys)
+    return pushforward_function(d.ab, d.f, d.xs...)(ys)
+end
+function Base.:*(ys, d::LazyJacobian)
+    if ys isa Tuple
+        ya = adjoint.(ys)
+    else
+        ya = adjoint(ys)
+    end
+    return pullback_function(d.ab, d.f, d.xs...)(ya)
+end
+
+struct LazyHessian{B, F, X}
+    backend::B
+    f::F
+    xs::X
+end
+function Base.:*(d::LazyHessian, ys)
+    return pushforward_function(
+        secondlowest(d.ab),
+        (xs...,) -> gradient(lowest(d.ab), d.f, xs...),
+        d.xs...,
+    )(ys)
+end
+function Base.:*(ys, d::LazyHessian)
+    if ys isa Tuple
+        ya = adjoint.(ys)
+    else
+        ya = adjoint(ys)
+    end
+    return pullback_function(
+        secondlowest(d.ab),
+        (xs...,) -> gradient(lowest(d.ab), d.f, xs...),
+        d.xs...,
+    )(ya)
+end
+
+function lazyderivative(ab::AbstractBackend, f, xs::Number...)
+    return LazyDerivative(ab, f, xs)
+end
+function lazygradient(ab::AbstractBackend, f, xs...)
+    return LazyGradient(ab, f, xs)
+end
+function lazyhessian(ab::AbstractBackend, f, xs...)
+    return LazyHessian(ab, f, xs)
+end
+function lazyjacobian(ab::AbstractBackend, f, xs...)
+    return LazyJacobian(ab, f, xs)
+end
+
+struct D{B, F}
+    backend::B
+    f::F
+end
+D(b::AbstractBackend, d::D) = H(HigherOrderBackend((b, d.b)), d.f)
+D(d::D) = H(HigherOrderBackend((d.backend, d.backend)), d.f)
+function (d::D)(xs...; lazy = true)
+    if lazy
+        return lazyjacobian(d.ab, d.f, xs...)
+    else
+        return jacobian(d.ab, d.f, xs...)
+    end
+end
+
+struct H{B, F}
+    backend::B
+    f::F
+end
+function (h::H)(xs...; lazy = true)
+    if lazy
+        return lazyhessian(h.ab, h.f, xs...)
+    else
+        return hessian(h.ab, h.f, xs...)
+    end
+end
+
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index a8a79fc..d7015ea 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,79 @@
 using AbstractDifferentiation
-using Test
+using Test, FiniteDifferences, LinearAlgebra
+
+const FDM = FiniteDifferences
+
+struct FDMBackend{A} <: AD.AbstractFiniteDifference
+    alg::A
+end
+FDMBackend() = FDMBackend(central_fdm(5, 1))
+const fdm_backend = FDMBackend()
+
+# Minimal interface
+function AD.jacobian(ab::FDMBackend, f, xs...)
+    return jacobian(ab.alg, f, xs...)
+end
+
+fder(x, y) = exp(y) * x + y * log(x)
+fgrad(x, y) = prod(x) + sum(y ./ (1:length(y)))
+function fjac(x, y)
+    x + Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U) * y
+end
+
+const xscalar = rand()
+const yscalar = rand()
+
+const xvec = rand(5)
+const yvec = rand(5)
 
 @testset "AbstractDifferentiation.jl" begin
-    # Write your tests here.
+    @testset "FiniteDifferences" begin
+        @testset "Derivative" begin
+            der1 = AD.derivative(fdm_backend, fder, xscalar, yscalar)
+            der2 = (
+                fdm_backend.alg(x -> fder(x, yscalar), xscalar),
+                fdm_backend.alg(y -> fder(xscalar, y), yscalar),
+            )
+            @test norm.(der1 .- der2) == (0, 0)
+            valscalar, der3 = AD.value_and_derivative(fdm_backend, fder, xscalar, yscalar)
+            @test valscalar == fder(xscalar, yscalar)
+            @test der3 .- der1 == (0, 0)
+        end
+        @testset "Gradient" begin
+            grad1 = AD.gradient(fdm_backend, fgrad, xvec, yvec)
+            grad2 = FDM.grad(fdm_backend.alg, fgrad, xvec, yvec)
+            @test norm.(grad1 .- grad2) == (0, 0)
+            valscalar, grad3 = AD.value_and_gradient(fdm_backend, fgrad, xvec, yvec)
+            @test valscalar == fgrad(xvec, yvec)
+            @test norm.(grad3 .- grad1) == (0, 0)
+        end
+        @testset "Jacobian" begin
+            jac1 = AD.jacobian(fdm_backend, fjac, xvec, yvec)
+            jac2 = FDM.jacobian(fdm_backend.alg, fjac, xvec, yvec)
+            @test norm.(jac1 .- jac2) == (0, 0)
+            valvec, jac3 = AD.value_and_jacobian(fdm_backend, fjac, xvec, yvec)
+            @test valvec == fjac(xvec, yvec)
+            @test norm.(jac3 .- jac1) == (0, 0)
+        end
+        @testset "jvp" begin
+            v = (rand(length(xvec)), rand(length(yvec)))
+            pf1 = AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
+            pf2 = (
+                FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),
+                FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])),
+            )
+            @test norm.(pf1 .- pf2) == (0, 0)
+            valvec, pf3 = AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
+            @test valvec == fjac(xvec, yvec)
+            @test norm.(pf3 .- pf1) == (0, 0)
+        end
+        @testset "j′vp" begin
+            w = rand(length(fjac(xvec, yvec)))
+            pb1 = AD.pullback_function(fdm_backend, fjac, xvec, yvec)(w)
+            pb2 = FDM.j′vp(fdm_backend.alg, fjac, w, xvec, yvec)
+            @test all(norm.(pb1 .- pb2) .<= (1e-10, 1e-10))
+            valvec, pb3 = AD.value_and_pullback_function(fdm_backend, fjac, xvec, yvec)(w)
+            @test valvec == fjac(xvec, yvec)
+            @test norm.(pb3 .- pb1) == (0, 0)
+        end
+    end
 end

From dfc5911d60fcb017f9050fb77b7b41f2059359cf Mon Sep 17 00:00:00 2001
From: Mohamed Tarek
Date: Tue, 9 Feb 2021 03:42:15 +1100
Subject: [PATCH 02/25] fix typo

---
 src/AbstractDifferentiation.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index ad539ca..e119261 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -96,7 +96,7 @@ function value_gradient_and_hessian(ab::AbstractBackend, f, xs...)
         v, g = value_and_gradient(lowest(ab), f, _xs...)
         if !primalcalled
             value = primalvalue(secondlowest(ab), v, f, xs)
-            primalcalled = false
+            primalcalled = true
         end
         return g
     end, xs...)
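Usage note: with just the two patches above applied, the package is already usable end to end. A backend only overloads `AD.jacobian`; derivatives, gradients, Hessians and the `value_and_*` variants all fall back to it, and for finite differences `primalvalue` simply recomputes `f(xs...)`. A minimal sketch mirroring the `FDMBackend` defined in the tests (the backend name `MyFDBackend` and the function `f` below are illustrative, not part of the patches):

    using AbstractDifferentiation, FiniteDifferences, LinearAlgebra

    struct MyFDBackend{A} <: AD.AbstractFiniteDifference
        alg::A
    end
    MyFDBackend() = MyFDBackend(central_fdm(5, 1))

    # The single primitive this backend provides:
    AD.jacobian(ab::MyFDBackend, f, xs...) = FiniteDifferences.jacobian(ab.alg, f, xs...)

    be = MyFDBackend()
    f(x, y) = x .+ 2 .* y
    x, y = rand(3), rand(3)

    AD.jacobian(be, f, x, y)                        # tuple of Jacobians, one per input
    val, jacs = AD.value_and_jacobian(be, f, x, y)  # primal recomputed via primalvalue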
From da5aecefaedaeefa65ecf4c742aa738c80ef0970 Mon Sep 17 00:00:00 2001
From: Mohamed Tarek
Date: Thu, 11 Feb 2021 04:01:12 +1100
Subject: [PATCH 03/25] primitive macro

---
 Project.toml                   |   3 +-
 src/AbstractDifferentiation.jl | 168 ++++++++++++++++++++++++-------
 test/runtests.jl               | 174 +++++++++++++++++++++++--------
 3 files changed, 266 insertions(+), 79 deletions(-)

diff --git a/Project.toml b/Project.toml
index 95fb5a4..8efed07 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,14 +4,15 @@ authors = ["Mohamed Tarek and contributors"]
 version = "0.1.0"
 
 [deps]
+ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [compat]
 julia = "1"
 
 [extras]
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 test = ["Test", "FiniteDifferences"]
diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index e119261..6992955 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -1,6 +1,6 @@
 module AbstractDifferentiation
 
-using LinearAlgebra
+using LinearAlgebra, ExprTools
 
 export AD
 
@@ -20,6 +20,7 @@ function reduceorder(b::HigherOrderBackend)
 end
 lowest(b::AbstractBackend) = b
 lowest(b::HigherOrderBackend) = b.backends[end]
+secondlowest(b::AbstractBackend) = b
 secondlowest(b::HigherOrderBackend) = lowest(reduceorder(b))
 
 # If the primal value is in y, extract it.
@@ -119,7 +120,7 @@ end
 function pushforward_function(
     ab::AbstractBackend,
     f,
-    xs::Union{Number, AbstractArray{<:Number}}...,
+    xs...,
 )
     return (ds) -> begin
         return jacobian(lowest(ab), (xds...,) -> begin
@@ -138,21 +139,21 @@ end
 function value_and_pushforward_function(
     ab::AbstractBackend,
     f,
-    xs::Union{Number, AbstractArray{<:Number}}...,
+    xs...,
 )
     return (ds) -> begin
         @assert ds isa Tuple && length(ds) == length(xs)
-        return value_and_jacobian(lowest(ab), (xds...,) -> begin
-            if ds isa Tuple
-                @assert length(xs) == length(ds)
-                newxs = xs .+ ds .* xds
-                return f(newxs...)
-            else
-                @assert length(xs) == length(xds) == 1
-                newx = xs[1] + ds * xds[1]
-                return f(newx)
+        local value
+        primalcalled = false
+        pf = pushforward_function(lowest(ab), (_xs...,) -> begin
+            vs = f(_xs...)
+            if !primalcalled
+                value = primalvalue(lowest(ab), vs, f, xs)
+                primalcalled = true
             end
-        end, _zero.(xs, ds)...)
+            return vs
+        end, xs...)(ds)
+        return value, pf
     end
 end
 
@@ -198,33 +199,23 @@ function value_and_pullback_function(
     return (ws) -> begin
         local value
         primalcalled = false
-        jacs = jacobian(lowest(ab), (_xs...,) -> begin
-            vs = f(_xs...)
+        if ws === nothing
+            vs = f(xs...)
             if !primalcalled
                 value = primalvalue(lowest(ab), vs, f, xs)
                 primalcalled = true
             end
-            if ws isa Tuple
-                @assert length(vs) == length(ws)
-                return sum(zip(vs, ws)) do v, w
-                    if w isa Union{AbstractMatrix, UniformScaling} && v isa AbstractVector
-                        return w' * v
-                    else
-                        # for arbitrary arrays
-                        return dot(w, v)
-                    end
-                end
-            else
-                w, v = ws, vs
-                if w isa Union{AbstractMatrix, UniformScaling} && v isa AbstractVector
-                    return w' * v
-                else
-                    # for arbitrary arrays
-                    return dot(w, v)
-                end
+            return value, nothing
+        end
+        pb = pullback_function(lowest(ab), (_xs...,) -> begin
+            vs = f(_xs...)
+            if !primalcalled
+                value = primalvalue(lowest(ab), vs, f, xs)
+                primalcalled = true
             end
-        end, xs...)
-        return value, adjoint.(jacs)
+            return vs
+        end, xs...)(ws)
+        return value, pb
     end
 end
 
@@ -329,4 +320,111 @@ function (h::H)(xs...; lazy = true)
     end
 end
 
+macro primitive(expr)
+    fdef = ExprTools.splitdef(expr)
+    name = fdef[:name]
+    if name == :pushforward_function
+        return define_pushforward_function_and_friends(fdef) |> esc
+    elseif name == :pullback_function
+        return define_pullback_function_and_friends(fdef) |> esc
+    elseif name == :jacobian
+        return define_jacobian_and_friends(fdef) |> esc
+    elseif name == :primalvalue
+        return define_primalvalue(fdef) |> esc
+    else
+        throw("Unsupported AD primitive.")
+    end
+end
+
+function define_pushforward_function_and_friends(fdef)
+    fdef[:name] = :(AbstractDifferentiation.pushforward_function)
+    args = fdef[:args]
+    funcs = quote
+        $(ExprTools.combinedef(fdef))
+        function AbstractDifferentiation.jacobian($(args...),)
+            identity_like = AbstractDifferentiation.identity_matrix_like($(args[3:end]...),)
+            pff = AbstractDifferentiation.pushforward_function($(args...),)
+            if eltype(identity_like) <: Tuple{Vararg{Union{AbstractMatrix, Number}}}
+                return map(identity_like) do identity_like_i
+                    return mapreduce(hcat, AbstractDifferentiation._eachcol.(identity_like_i)...) do (cols...)
+                        pff(cols)
+                    end
+                end
+            else
+                return pff(identity_like)
+            end
+        end
+    end
+    return funcs
+end
+
+function define_pullback_function_and_friends(fdef)
+    fdef[:name] = :(AbstractDifferentiation.pullback_function)
+    args = fdef[:args]
+    funcs = quote
+        $(ExprTools.combinedef(fdef))
+        function AbstractDifferentiation.jacobian($(args...),)
+            value_and_pbf = AbstractDifferentiation.value_and_pullback_function($(args...),)
+            value, _ = value_and_pbf(nothing)
+            identity_like = AbstractDifferentiation.identity_matrix_like(value)
+            if eltype(identity_like) <: Tuple{Vararg{AbstractMatrix}}
+                return map(identity_like) do identity_like_i
+                    return mapreduce(vcat, AbstractDifferentiation._eachcol.(identity_like_i)...) do (cols...)
+                        value_and_pbf(cols)[2]'
+                    end
+                end
+            else
+                return adjoint.(value_and_pbf(identity_like)[2])
+            end
+        end
+    end
+    return funcs
+end
+
+_eachcol(a::Number) = (a,)
+_eachcol(a) = eachcol(a)
+
+function define_jacobian_and_friends(fdef)
+    fdef[:name] = :(AbstractDifferentiation.jacobian)
+    return ExprTools.combinedef(fdef)
+end
+
+function define_primalvalue(fdef)
+    fdef[:name] = :(AbstractDifferentiation.primalvalue)
+    return ExprTools.combinedef(fdef)
+end
+
+function identity_matrix_like(x)
+    throw("The function `identity_matrix_like` is not defined for the type $(typeof(x)).")
+end
+function identity_matrix_like(x::AbstractVector)
+    return (Matrix{eltype(x)}(I, length(x), length(x)),)
+end
+function identity_matrix_like(x::Number)
+    return (one(x),)
+end
+identity_matrix_like(x::Tuple) = identity_matrix_like(x...)
+@generated function identity_matrix_like(x...)
+    expr = :(())
+    for i in 1:length(x)
+        push!(expr.args, :(()))
+        for j in 1:i-1
+            push!(expr.args[i].args, :((zero_matrix_like(x[$j])[1])))
+        end
+        push!(expr.args[i].args, :((identity_matrix_like(x[$i]))[1]))
+        for j in i+1:length(x)
+            push!(expr.args[i].args, :(zero_matrix_like(x[$j])[1]))
+        end
+    end
+    return expr
+end
+
+zero_matrix_like(x::Tuple) = zero_matrix_like(x...)
+zero_matrix_like(x...) = map(zero_matrix_like, x)
+zero_matrix_like(x::AbstractVector) = (zero(similar(x, length(x), length(x))),)
+zero_matrix_like(x::Number) = (zero(x),)
+function zero_matrix_like(x)
+    throw("The function `zero_matrix_like` is not defined for the type $(typeof(x)).")
+end
+
 end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index d7015ea..324a265 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,17 +2,39 @@ using AbstractDifferentiation
 using Test, FiniteDifferences, LinearAlgebra
 
 const FDM = FiniteDifferences
-struct FDMBackend{A} <: AD.AbstractFiniteDifference
+
+struct FDMBackend1{A} <: AD.AbstractFiniteDifference
     alg::A
 end
-FDMBackend() = FDMBackend(central_fdm(5, 1))
-const fdm_backend = FDMBackend()
-
+FDMBackend1() = FDMBackend1(central_fdm(5, 1))
+const fdm_backend1 = FDMBackend1()
 # Minimal interface
-function AD.jacobian(ab::FDMBackend, f, xs...)
+AD.@primitive function jacobian(ab::FDMBackend1, f, xs...)
     return jacobian(ab.alg, f, xs...)
 end
 
+struct FDMBackend2{A} <: AD.AbstractFiniteDifference
+    alg::A
+end
+FDMBackend2() = FDMBackend2(central_fdm(5, 1))
+const fdm_backend2 = FDMBackend2()
+AD.@primitive function pushforward_function(ab::FDMBackend2, f, xs...)
+    return (vs) -> jvp(ab.alg, f, tuple.(xs, vs)...)
+end
+
+struct FDMBackend3{A} <: AD.AbstractFiniteDifference
+    alg::A
+end
+FDMBackend3() = FDMBackend3(central_fdm(5, 1))
+const fdm_backend3 = FDMBackend3()
+AD.@primitive function pullback_function(ab::FDMBackend3, f, xs...)
+    return function (vs)
+        # Supports only single output
+        @assert length(vs) == 1
+        return j′vp(ab.alg, f, vs[1], xs...)
+    end
+end
+
 fder(x, y) = exp(y) * x + y * log(x)
 fgrad(x, y) = prod(x) + sum(y ./ (1:length(y)))
 function fjac(x, y)
@@ -25,55 +47,121 @@ const yscalar = rand()
 const xvec = rand(5)
 const yvec = rand(5)
 
+function test_fdm_derivatives(fdm_backend)
+    der1 = AD.derivative(fdm_backend, fder, xscalar, yscalar)
+    der2 = (
+        fdm_backend.alg(x -> fder(x, yscalar), xscalar),
+        fdm_backend.alg(y -> fder(xscalar, y), yscalar),
+    )
+    @test norm.(der1 .- der2) == (0, 0)
+    valscalar, der3 = AD.value_and_derivative(fdm_backend, fder, xscalar, yscalar)
+    @test valscalar == fder(xscalar, yscalar)
+    @test der3 .- der1 == (0, 0)
+end
+
+function test_fdm_gradients(fdm_backend)
+    grad1 = AD.gradient(fdm_backend, fgrad, xvec, yvec)
+    grad2 = FDM.grad(fdm_backend.alg, fgrad, xvec, yvec)
+    @test norm.(grad1 .- grad2) == (0, 0)
+    valscalar, grad3 = AD.value_and_gradient(fdm_backend, fgrad, xvec, yvec)
+    @test valscalar == fgrad(xvec, yvec)
+    @test norm.(grad3 .- grad1) == (0, 0)
+end
+
+function test_fdm_jacobians(fdm_backend)
+    jac1 = AD.jacobian(fdm_backend, fjac, xvec, yvec)
+    jac2 = FDM.jacobian(fdm_backend.alg, fjac, xvec, yvec)
+    @test norm.(jac1 .- jac2) == (0, 0)
+    valvec, jac3 = AD.value_and_jacobian(fdm_backend, fjac, xvec, yvec)
+    @test valvec == fjac(xvec, yvec)
+    @test norm.(jac3 .- jac1) == (0, 0)
+end
+
+function test_fdm_hessians(fdm_backend)
+    fhess = x -> fgrad(x, yvec)
+    hess1 = AD.hessian(fdm_backend, fhess, xvec)
+    hess2 = FDM.jacobian(
+        fdm_backend.alg,
+        (x) -> begin
+            FDM.grad(
+                fdm_backend.alg,
+                fhess,
+                x,
+            )
+        end,
+        xvec,
+    )
+    @test norm.(hess1 .- hess2) == (0,)
+    valscalar, hess3 = AD.value_and_hessian(fdm_backend, fhess, xvec)
+    @test valscalar == fgrad(xvec, yvec)
+    @test norm.(hess3 .- hess1) == (0,)
+    valscalar, grad, hess4 = AD.value_gradient_and_hessian(fdm_backend, fhess, xvec)
+    @test valscalar == fgrad(xvec, yvec)
+    @test norm.(grad .- AD.gradient(fdm_backend, fhess, xvec)) == (0,)
+    @test norm.(hess4 .- hess1) == (0,)
+end
+
+function test_fdm_jvp(fdm_backend)
+    v = (rand(length(xvec)), rand(length(yvec)))
+    pf1 = AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
+    pf2 = (
+        FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),
+        FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])),
+    )
+    @test norm.(pf1 .- pf2) == (0, 0)
+    valvec, pf3 = AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
+    @test valvec == fjac(xvec, yvec)
+    @test norm.(pf3 .- pf1) == (0, 0)
+end
+
+function test_fdm_j′vp(fdm_backend)
+    w = rand(length(fjac(xvec, yvec)))
+    pb1 = AD.pullback_function(fdm_backend, fjac, xvec, yvec)(w)
+    pb2 = FDM.j′vp(fdm_backend.alg, fjac, w, xvec, yvec)
+    @test all(norm.(pb1 .- pb2) .<= (1e-10, 1e-10))
+    valvec, pb3 = AD.value_and_pullback_function(fdm_backend, fjac, xvec, yvec)(w)
+    @test valvec == fjac(xvec, yvec)
+    @test norm.(pb3 .- pb1) == (0, 0)
+end
+
 @testset "AbstractDifferentiation.jl" begin
     @testset "FiniteDifferences" begin
         @testset "Derivative" begin
-            der1 = AD.derivative(fdm_backend, fder, xscalar, yscalar)
-            der2 = (
-                fdm_backend.alg(x -> fder(x, yscalar), xscalar),
-                fdm_backend.alg(y -> fder(xscalar, y), yscalar),
-            )
-            @test norm.(der1 .- der2) == (0, 0)
-            valscalar, der3 = AD.value_and_derivative(fdm_backend, fder, xscalar, yscalar)
-            @test valscalar == fder(xscalar, yscalar)
-            @test der3 .- der1 == (0, 0)
+            test_fdm_derivatives(fdm_backend1)
+            test_fdm_derivatives(fdm_backend2)
+            test_fdm_derivatives(fdm_backend3)
         end
         @testset "Gradient" begin
-            grad1 = AD.gradient(fdm_backend, fgrad, xvec, yvec)
-            grad2 = FDM.grad(fdm_backend.alg, fgrad, xvec, yvec)
-            @test norm.(grad1 .- grad2) == (0, 0)
-            valscalar, grad3 = AD.value_and_gradient(fdm_backend, fgrad, xvec, yvec)
-            @test valscalar == fgrad(xvec, yvec)
-            @test norm.(grad3 .- grad1) == (0, 0)
+            test_fdm_gradients(fdm_backend1)
+            test_fdm_gradients(fdm_backend2)
+            test_fdm_gradients(fdm_backend3)
         end
         @testset "Jacobian" begin
-            jac1 = AD.jacobian(fdm_backend, fjac, xvec, yvec)
-            jac2 = FDM.jacobian(fdm_backend.alg, fjac, xvec, yvec)
-            @test norm.(jac1 .- jac2) == (0, 0)
-            valvec, jac3 = AD.value_and_jacobian(fdm_backend, fjac, xvec, yvec)
-            @test valvec == fjac(xvec, yvec)
-            @test norm.(jac3 .- jac1) == (0, 0)
+            test_fdm_jacobians(fdm_backend1)
+            test_fdm_jacobians(fdm_backend2)
+            # Errors
+            test_fdm_jacobians(fdm_backend3)
+        end
+        @testset "Hessian" begin
+            # Works but super slow
+            test_fdm_hessians(fdm_backend1)
+            # Errors
+            test_fdm_hessians(fdm_backend2)
+            # Errors
+            test_fdm_hessians(fdm_backend3)
         end
         @testset "jvp" begin
-            v = (rand(length(xvec)), rand(length(yvec)))
-            pf1 = AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
-            pf2 = (
-                FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),
-                FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])),
-            )
-            @test norm.(pf1 .- pf2) == (0, 0)
-            valvec, pf3 = AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
-            @test valvec == fjac(xvec, yvec)
-            @test norm.(pf3 .- pf1) == (0, 0)
+            test_fdm_jvp(fdm_backend1)
+            # Errors
+            test_fdm_jvp(fdm_backend2)
+            # Errors
+            test_fdm_jvp(fdm_backend3)
         end
         @testset "j′vp" begin
-            w = rand(length(fjac(xvec, yvec)))
-            pb1 = AD.pullback_function(fdm_backend, fjac, xvec, yvec)(w)
-            pb2 = FDM.j′vp(fdm_backend.alg, fjac, w, xvec, yvec)
-            @test all(norm.(pb1 .- pb2) .<= (1e-10, 1e-10))
-            valvec, pb3 = AD.value_and_pullback_function(fdm_backend, fjac, xvec, yvec)(w)
-            @test valvec == fjac(xvec, yvec)
-            @test norm.(pb3 .- pb1) == (0, 0)
+            test_fdm_j′vp(fdm_backend1)
+            test_fdm_j′vp(fdm_backend2)
+            # Errors
+            test_fdm_j′vp(fdm_backend3)
         end
     end
 end
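Usage note: the `@primitive` macro is the heart of this patch. A backend now declares exactly one of `jacobian`, `pushforward_function`, or `pullback_function`, and the corresponding `AD.jacobian` method is generated from it, column by column from basis-vector seeds. A hedged sketch of a pushforward-only backend, modeled on `FDMBackend2` from the tests above (the names `JVPBackend`, `g`, `x`, `y` are illustrative, not part of the diffs):

    using AbstractDifferentiation, FiniteDifferences

    struct JVPBackend{A} <: AD.AbstractFiniteDifference
        alg::A
    end

    # The backend's only primitive; AD.@primitive also generates an
    # AD.jacobian method that assembles the Jacobian from pushforwards.
    AD.@primitive function pushforward_function(ab::JVPBackend, f, xs...)
        return (vs) -> FiniteDifferences.jvp(ab.alg, f, tuple.(xs, vs)...)
    end

    backend = JVPBackend(central_fdm(5, 1))
    g(x, y) = x .+ 2 .* y
    x, y = rand(3), rand(3)

    AD.pushforward_function(backend, g, x, y)((ones(3), zeros(3)))  # one JVP, here ∂g/∂x * ones(3)
    AD.jacobian(backend, g, x, y)  # both Jacobians, assembled column by column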
From 512fd2d5691ce10a530c4df64c0ed258dd27b1f1 Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Mon, 3 May 2021 11:29:20 +0200
Subject: [PATCH 04/25] fix gradients

---
 src/AbstractDifferentiation.jl | 11 ++++++++++-
 test/runtests.jl               | 22 ++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index 6992955..847b4b7 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -54,6 +54,10 @@ end
 function value_and_jacobian(ab::AbstractBackend, f, xs...)
     local value
     primalcalled = false
+    if ab isa AbstractFiniteDifference
+        value = primalvalue(ab, nothing, f, xs)
+        primalcalled = true
+    end
     jacs = jacobian(lowest(ab), (_xs...,) -> begin
         v = f(_xs...)
         if !primalcalled
@@ -62,11 +66,16 @@ function value_and_jacobian(ab::AbstractBackend, f, xs...)
         end
         return v
     end, xs...)
+
     return value, jacs
 end
 function value_and_hessian(ab::AbstractBackend, f, xs...)
     local value
     primalcalled = false
+    if ab isa AbstractFiniteDifference
+        value = primalvalue(ab, nothing, f, xs)
+        primalcalled = true
+    end
     hess = jacobian(secondlowest(ab), (_xs...,) -> begin
         v, g = value_and_gradient(lowest(ab), f, _xs...)
         if !primalcalled
@@ -427,4 +436,4 @@ function zero_matrix_like(x)
     throw("The function `zero_matrix_like` is not defined for the type $(typeof(x)).")
 end
 
-end
\ No newline at end of file
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 324a265..331ff9a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -36,10 +36,18 @@ AD.@primitive function pullback_function(ab::FDMBackend3, f, xs...)
 end
 
 fder(x, y) = exp(y) * x + y * log(x)
+dfderdx(x, y) = exp(y) + y * 1/x
+dfderdy(x, y) = exp(y) * x + log(x)
+
 fgrad(x, y) = prod(x) + sum(y ./ (1:length(y)))
+dfgraddx(x, y) = prod(x)./x
+dfgraddy(x, y) = one(eltype(y)) ./ (1:length(y))
+
 function fjac(x, y)
     x + Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U) * y
 end
+dfjacdx(x, y) = I(length(x))
+dfjacdy(x, y) = Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U)
 
 const xscalar = rand()
 const yscalar = rand()
@@ -47,6 +55,10 @@ const yscalar = rand()
 const xvec = rand(5)
 const yvec = rand(5)
 
+# to check if vectors get mutated
+xvec2 = deepcopy(xvec)
+yvec2 = deepcopy(yvec)
+
 function test_fdm_derivatives(fdm_backend)
     der1 = AD.derivative(fdm_backend, fder, xscalar, yscalar)
     der2 = (
@@ -57,6 +69,8 @@ function test_fdm_derivatives(fdm_backend)
     valscalar, der3 = AD.value_and_derivative(fdm_backend, fder, xscalar, yscalar)
     @test valscalar == fder(xscalar, yscalar)
     @test der3 .- der1 == (0, 0)
+    der_exact = (dfderdx(xscalar,yscalar), dfderdy(xscalar,yscalar))
+    @test minimum(isapprox.(der_exact, der1, rtol=1e-10))
 end
 
 function test_fdm_gradients(fdm_backend)
@@ -66,6 +80,10 @@ function test_fdm_gradients(fdm_backend)
     valscalar, grad3 = AD.value_and_gradient(fdm_backend, fgrad, xvec, yvec)
     @test valscalar == fgrad(xvec, yvec)
     @test norm.(grad3 .- grad1) == (0, 0)
+    grad_exact = (dfgraddx(xvec,yvec), dfgraddy(xvec,yvec))
+    @test minimum(isapprox.(grad_exact, grad1, rtol=1e-10))
+    @test xvec == xvec2
+    @test yvec == yvec2
 end
 
 function test_fdm_jacobians(fdm_backend)
@@ -75,6 +93,10 @@ function test_fdm_jacobians(fdm_backend)
     valvec, jac3 = AD.value_and_jacobian(fdm_backend, fjac, xvec, yvec)
     @test valvec == fjac(xvec, yvec)
     @test norm.(jac3 .- jac1) == (0, 0)
+    grad_exact = (dfjacdx(xvec, yvec), dfjacdy(xvec, yvec))
+    @test minimum(isapprox.(grad_exact, jac1, rtol=1e-10))
+    @test xvec == xvec2
+    @test yvec == yvec2
 end

From caa72e1e6a7a33c8271bf4067b7b41ba51add397 Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Mon, 3 May 2021 21:55:51 +0200
Subject: [PATCH 05/25] fix Jacobian tests

---
 src/AbstractDifferentiation.jl | 4 ++++
 test/runtests.jl               | 1 -
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index 847b4b7..3fd0f36 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -382,6 +382,10 @@ function define_pullback_function_and_friends(fdef)
                         value_and_pbf(cols)[2]'
                     end
                 end
+            elseif eltype(identity_like) <: AbstractMatrix
+                return vcat.(mapslices(identity_like[1], dims=1) do cols
+                    adjoint.(value_and_pbf((cols,))[2])
+                end ...)
             else
                 return adjoint.(value_and_pbf(identity_like)[2])
             end
diff --git a/test/runtests.jl b/test/runtests.jl
index 331ff9a..2f96c07 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -161,7 +161,6 @@ end
         @testset "Jacobian" begin
             test_fdm_jacobians(fdm_backend1)
             test_fdm_jacobians(fdm_backend2)
-            # Errors
             test_fdm_jacobians(fdm_backend3)
         end
         @testset "Hessian" begin

From 4a763e3693fdf9e14c25dde454121730e45a2158 Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Wed, 5 May 2021 15:01:18 +0200
Subject: [PATCH 06/25] fix hessian tests

---
 src/AbstractDifferentiation.jl | 13 +++++++++++++
 test/runtests.jl               | 10 +++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index 3fd0f36..0610049 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -359,6 +359,19 @@ function define_pushforward_function_and_friends(fdef)
                         pff(cols)
                     end
                 end
+            elseif eltype(identity_like) <: AbstractMatrix
+                ret = hcat.(mapslices(identity_like[1], dims=1) do cols
+                    pf = pff((cols,))
+                    if typeof(pf) <: AbstractVector
+                        return (pf, )
+                    elseif typeof(pf) <: AbstractMatrix
+                        return (transpose(pf), )
+                    else
+                        return pf
+                    end
+                end ...)
+                return ret isa Tuple ? ret : (ret,)
+
             else
                 return pff(identity_like)
             end
diff --git a/test/runtests.jl b/test/runtests.jl
index 2f96c07..88b8590 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -42,6 +42,7 @@ dfderdy(x, y) = exp(y) * x + log(x)
 fgrad(x, y) = prod(x) + sum(y ./ (1:length(y)))
 dfgraddx(x, y) = prod(x)./x
 dfgraddy(x, y) = one(eltype(y)) ./ (1:length(y))
+dfgraddxdx(x, y) = prod(x)./(x*x') - Diagonal(diag(prod(x)./(x*x')))
 
 function fjac(x, y)
     x + Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U) * y
@@ -121,6 +122,12 @@ function test_fdm_hessians(fdm_backend)
     @test valscalar == fgrad(xvec, yvec)
     @test norm.(grad .- AD.gradient(fdm_backend, fhess, xvec)) == (0,)
     @test norm.(hess4 .- hess1) == (0,)
+    @test dfgraddxdx(xvec,yvec) ≈ hess1[1] atol=1e-10
+    @test xvec == xvec2
+    @test yvec == yvec2
+    fhess2 = x-> dfgraddx(x, yvec)
+    hess5 = AD.jacobian(fdm_backend, fhess2, xvec)
+    @test minimum(isapprox.(hess5, hess1, atol=1e-10))
 end
 
 function test_fdm_jvp(fdm_backend)
@@ -166,16 +173,13 @@ end
         @testset "Hessian" begin
             # Works but super slow
             test_fdm_hessians(fdm_backend1)
-            # Errors
             test_fdm_hessians(fdm_backend2)
-            # Errors
             test_fdm_hessians(fdm_backend3)
         end
         @testset "jvp" begin
             test_fdm_jvp(fdm_backend1)
             # Errors
             test_fdm_jvp(fdm_backend2)
-            # Errors
             test_fdm_jvp(fdm_backend3)
         end
         @testset "j′vp" begin
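Usage note: with the elementwise matrix branches in place, the second-order API now works for all three primitive styles (the jacobian-based backend still works but is slow, per the comment above). The entry points exercised by `test_fdm_hessians`, in a small sketch reusing the fixtures from the tests (`fdm_backend1`, `fgrad`, `xvec`, `yvec`):

    fhess = x -> fgrad(x, yvec)   # scalar function of one vector input
    hess, = AD.hessian(fdm_backend1, fhess, xvec)          # 5×5 matrix, returned as a 1-tuple
    val, hess2 = AD.value_and_hessian(fdm_backend1, fhess, xvec)
    val, grad, hess3 = AD.value_gradient_and_hessian(fdm_backend1, fhess, xvec)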
From b6f48045fe9c4babf4efdbf9b2f977bf9cd51d98 Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Wed, 5 May 2021 16:24:50 +0200
Subject: [PATCH 07/25] =?UTF-8?q?fix=20=20j=E2=80=B2vp?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Project.toml                   |  3 ++-
 src/AbstractDifferentiation.jl |  8 ++++++++
 test/runtests.jl               | 23 ++++++++++++++++++++---
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/Project.toml b/Project.toml
index 8efed07..38eefd4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,7 +12,8 @@ julia = "1"
 
 [extras]
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "FiniteDifferences"]
+test = ["Test", "FiniteDifferences", "Random"]
diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index 0610049..bc2ffd6 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -154,6 +154,10 @@ function value_and_pushforward_function(
         @assert ds isa Tuple && length(ds) == length(xs)
         local value
         primalcalled = false
+        if ab isa AbstractFiniteDifference
+            value = primalvalue(ab, nothing, f, xs)
+            primalcalled = true
+        end
         pf = pushforward_function(lowest(ab), (_xs...,) -> begin
             vs = f(_xs...)
             if !primalcalled
@@ -208,6 +212,10 @@ function value_and_pullback_function(
     return (ws) -> begin
         local value
         primalcalled = false
+        if ab isa AbstractFiniteDifference
+            value = primalvalue(ab, nothing, f, xs)
+            primalcalled = true
+        end
         if ws === nothing
             vs = f(xs...)
             if !primalcalled
diff --git a/test/runtests.jl b/test/runtests.jl
index 88b8590..d863686 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,7 @@
 using AbstractDifferentiation
 using Test, FiniteDifferences, LinearAlgebra
+using Random
+Random.seed!(1234)
 
 const FDM = FiniteDifferences
 
@@ -30,8 +32,13 @@ const fdm_backend3 = FDMBackend3()
 AD.@primitive function pullback_function(ab::FDMBackend3, f, xs...)
     return function (vs)
         # Supports only single output
-        @assert length(vs) == 1
-        return j′vp(ab.alg, f, vs[1], xs...)
+        if vs isa AbstractVector
+            return j′vp(ab.alg, f, vs, xs...)
+        else
+            @assert length(vs) == 1
+            return j′vp(ab.alg, f, vs[1], xs...)
+
+        end
     end
 end
 
@@ -50,6 +57,14 @@ end
 dfjacdx(x, y) = I(length(x))
 dfjacdy(x, y) = Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U)
 
+# Jvp
+jxvp(x,y,v) = dfjacdx(x,y)*v
+jyvp(x,y,v) = dfjacdy(x,y)*v
+
+# vJp
+vJxp(x,y,v) = dfjacdx(x,y)'*v
+vJyp(x,y,v) = dfjacdy(x,y)'*v
+
 const xscalar = rand()
 const yscalar = rand()
 
@@ -151,6 +166,9 @@ function test_fdm_j′vp(fdm_backend)
     valvec, pb3 = AD.value_and_pullback_function(fdm_backend, fjac, xvec, yvec)(w)
     @test valvec == fjac(xvec, yvec)
     @test norm.(pb3 .- pb1) == (0, 0)
+    @test minimum(isapprox.(pb1, (vJxp(xvec,yvec,w), vJyp(xvec,yvec,w)), atol=1e-10))
+    @test xvec == xvec2
+    @test yvec == yvec2
 end
 
 @testset "AbstractDifferentiation.jl" begin
@@ -185,7 +203,6 @@ end
         @testset "j′vp" begin
             test_fdm_j′vp(fdm_backend1)
             test_fdm_j′vp(fdm_backend2)
-            # Errors
             test_fdm_j′vp(fdm_backend3)
         end
     end
 end

From 5ed5c4ef256eee0ba7e50400ab708f1f37f833db Mon Sep 17 00:00:00 2001
From: frankschae <42201748+frankschae@users.noreply.github.com>
Date: Sun, 9 May 2021 17:37:51 +0200
Subject: [PATCH 08/25] Update src/AbstractDifferentiation.jl

Co-authored-by: Mohamed Tarek
---
 src/AbstractDifferentiation.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index bc2ffd6..e148f67 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -54,7 +54,7 @@ end
 function value_and_jacobian(ab::AbstractBackend, f, xs...)
     local value
     primalcalled = false
-    if ab isa AbstractFiniteDifference
+    if lowest(ab) isa AbstractFiniteDifference
         value = primalvalue(ab, nothing, f, xs)
         primalcalled = true
     end

From 2991b8a5f7854664212d90c590e7798816b767aa Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Tue, 11 May 2021 22:24:30 +0200
Subject: [PATCH 09/25] fix vjp tests

---
 test/runtests.jl | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index d863686..c9dba17 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -147,15 +147,26 @@ end
 function test_fdm_jvp(fdm_backend)
     v = (rand(length(xvec)), rand(length(yvec)))
-    pf1 = AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
+
+    # augmented version of v
+    identity_like = AD.identity_matrix_like(v)
+    vaug = map(identity_like) do identity_like_i
+        identity_like_i .* v
+    end
+
+    pf1 = map(v->AD.pushforward_function(fdm_backend2, fjac, xvec, yvec)(v), vaug)
     pf2 = (
         FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),
         FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])),
     )
     @test norm.(pf1 .- pf2) == (0, 0)
-    valvec, pf3 = AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
-    @test valvec == fjac(xvec, yvec)
-    @test norm.(pf3 .- pf1) == (0, 0)
+    ((valvec1, pf3x), (valvec2, pf3y)) = map(v->AD.value_and_pushforward_function(fdm_backend2, fjac, xvec, yvec)(v), vaug)
+    @test valvec1 == fjac(xvec, yvec)
+    @test valvec2 == fjac(xvec, yvec)
+    @test norm.((pf3x,pf3y) .- pf1) == (0, 0)
+    @test minimum(isapprox.(pf1, (jxvp(xvec,yvec,v[1]), jyvp(xvec,yvec,v[2])), atol=1e-10))
+    @test xvec == xvec2
+    @test yvec == yvec2
 end
 
@@ -196,7 +207,6 @@ end
         @testset "jvp" begin
             test_fdm_jvp(fdm_backend1)
-            # Errors
             test_fdm_jvp(fdm_backend2)
             test_fdm_jvp(fdm_backend3)
         end
From 959fa013026b59e9c3fa1c622d366e376f7bdc8f Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Fri, 21 May 2021 16:15:27 +0200
Subject: [PATCH 11/25] lazy derivative fixes

---
 src/AbstractDifferentiation.jl | 31 ++++++++++++--
 test/runtests.jl               | 75 ++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index e148f67..0cd44fe 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -31,8 +31,14 @@ primalvalue(x::Tuple) = map(primalvalue, x)
 primalvalue(x) = x
 
 function derivative(ab::AbstractBackend, f, xs::Number...)
-    return getindex.(jacobian(lowest(ab), f, xs...), 1)
+    der = getindex.(jacobian(lowest(ab), f, xs...), 1)
+    if der isa Tuple
+        return der
+    else
+        return (der,)
+    end
 end
+
 function gradient(ab::AbstractBackend, f, xs...)
     return adjoint.(jacobian(lowest(ab), f, xs...))
 end
@@ -241,13 +247,32 @@ struct LazyDerivative{B, F, X}
     f::F
     xs::X
 end
+
 function Base.:*(d::LazyDerivative, y)
-    return derivative(d.ab, d.f, d.xs...) * y
+    return derivative(d.backend, d.f, d.xs...) * y
 end
+
 function Base.:*(y, d::LazyDerivative)
-    return y * derivative(d.ab, d.f, d.xs...)
+    return y * derivative(d.backend, d.f, d.xs...)
+end
+
+function Base.:*(d::LazyDerivative, y::Union{Number,Tuple})
+    return derivative(d.backend, d.f, d.xs...) .* y
+end
+
+function Base.:*(y::Union{Number,Tuple}, d::LazyDerivative)
+    return y .* derivative(d.backend, d.f, d.xs...)
+end
+
+function Base.:*(d::LazyDerivative, y::AbstractArray)
+    return map((d)-> d*y, derivative(d.backend, d.f, d.xs...))
+end
+
+function Base.:*(y::AbstractArray, d::LazyDerivative)
+    return map((d)-> y*d, derivative(d.backend, d.f, d.xs...))
+end
+
+
 struct LazyGradient{B, F, X}
     backend::B
     f::F
     xs::X
diff --git a/test/runtests.jl b/test/runtests.jl
index c9dba17..843dde4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -182,6 +182,76 @@ function test_fdm_j′vp(fdm_backend)
     @test yvec == yvec2
 end
 
+function test_fdm_lazy_derivatives(fdm_backend)
+    # single input function
+    der1 = AD.derivative(fdm_backend, x->fder(x, yscalar), xscalar)
+    der2 = (
+        fdm_backend.alg(x -> fder(x, yscalar), xscalar),
+        fdm_backend.alg(y -> fder(xscalar, y), yscalar),
+    )
+
+    lazyder = AD.LazyDerivative(fdm_backend, x->fder(x, yscalar), xscalar)
+
+    # multiplication with scalar
+    @test der1[1]*yscalar == der2[1]*yscalar
+    @test lazyder*yscalar == der1.*yscalar
+    @test lazyder*yscalar isa Tuple
+
+    @test yscalar*der1[1] == yscalar*der2[1]
+    @test yscalar*lazyder == yscalar.*der1
+    @test yscalar*lazyder isa Tuple
+
+    # multiplication with array
+    @test der1[1]*yvec == der2[1]*yvec
+    @test lazyder*yvec == (der1.*yvec,)
+    @test lazyder*yvec isa Tuple
+
+    @test yvec*der1[1] == yvec*der2[1]
+    @test yvec*lazyder == (yvec.*der1,)
+    @test yvec*lazyder isa Tuple
+
+    # multiplication with tuple
+    @test lazyder*(yscalar,) == lazyder*yscalar
+    @test lazyder*(yvec,) == lazyder*yvec
+
+    @test (yscalar,)*lazyder == yscalar*lazyder
+    @test (yvec,)*lazyder == yvec*lazyder
+
+    # two input function
+    der1 = AD.derivative(fdm_backend, fder, xscalar, yscalar)
+    der2 = (
+        fdm_backend.alg(x -> fder(x, yscalar), xscalar),
+        fdm_backend.alg(y -> fder(xscalar, y), yscalar),
+    )
+
+    lazyder = AD.LazyDerivative(fdm_backend, fder, (xscalar, yscalar))
+
+    # multiplication with scalar
+    @test der1.*yscalar == der2.*yscalar
+    @test lazyder*yscalar == der1.*yscalar
+    @test lazyder*yscalar isa Tuple
+
+    @test yscalar.*der1 == yscalar.*der2
+    @test yscalar*lazyder == yscalar.*der1
+    @test yscalar*lazyder isa Tuple
+
+    # multiplication with array
+    @test (der1[1]*yvec, der1[2]*yvec) == (der2[1]*yvec, der2[2]*yvec)
+    @test lazyder*yvec == (der1[1]*yvec, der1[2]*yvec)
+    @test lazyder*yvec isa Tuple
+
+    @test (yvec*der1[1], yvec*der1[2]) == (yvec*der2[1], yvec*der2[2])
+    @test yvec*lazyder == (yvec*der1[1], yvec*der1[2])
+    @test lazyder*yvec isa Tuple
+
+    # multiplication with tuple
+    @test lazyder*(yscalar,) == lazyder*yscalar
+    @test lazyder*(yvec,) == lazyder*yvec
+
+    @test (yscalar,)*lazyder == yscalar*lazyder
+    @test (yvec,)*lazyder == yvec*lazyder
+end
+
 @testset "AbstractDifferentiation.jl" begin
     @testset "FiniteDifferences" begin
         @testset "Derivative" begin
@@ -215,5 +285,10 @@ end
             test_fdm_j′vp(fdm_backend2)
             test_fdm_j′vp(fdm_backend3)
         end
+        @testset "Lazy Derivative" begin
+            test_fdm_lazy_derivatives(fdm_backend1)
+            test_fdm_lazy_derivatives(fdm_backend2)
+            test_fdm_lazy_derivatives(fdm_backend3)
+        end
     end
 end
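Usage note: the lazy wrappers defer differentiation until they are multiplied. The rules pinned down here: a scalar or tuple broadcasts over the tuple of partial derivatives, while an array is multiplied by each partial in turn, so the result is always a tuple with one entry per input. A short sketch against the test fixtures above (the bindings below are illustrative):

    lazyder = AD.LazyDerivative(fdm_backend1, fder, (xscalar, yscalar))
    lazyder * 2.0   # (2⋅∂fder/∂x, 2⋅∂fder/∂y): scalar broadcasts over the tuple of partials
    yvec * lazyder  # (yvec⋅∂fder/∂x, yvec⋅∂fder/∂y): one array per input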
From 5102f01cfe90e660eafdbe190ae89e867e56f049 Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Fri, 21 May 2021 17:19:16 +0200
Subject: [PATCH 12/25] lazy gradient

---
 src/AbstractDifferentiation.jl | 22 +++++++++++++++--
 test/runtests.jl               | 43 ++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index 0cd44fe..636a344 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -278,8 +278,26 @@ struct LazyGradient{B, F, X}
     f::F
     xs::X
 end
-Base.:*(d::LazyGradient, y) = gradient(d.ab, d.f, d.xs...) * y
-Base.:*(y, d::LazyGradient) = y * gradient(d.ab, d.f, d.xs...)
+Base.:*(d::LazyGradient, y) = gradient(d.backend, d.f, d.xs...) * y
+Base.:*(y, d::LazyGradient) = y * gradient(d.backend, d.f, d.xs...)
+
+
+function Base.:*(d::LazyGradient, y::Union{Number,Tuple})
+    if d.xs isa Tuple
+        return gradient(d.backend, d.f, d.xs...) .* y
+    else
+        return gradient(d.backend, d.f, d.xs) .* y
+    end
+end
+
+function Base.:*(y::Union{Number,Tuple}, d::LazyGradient)
+    if d.xs isa Tuple
+        return y .* gradient(d.backend, d.f, d.xs...)
+    else
+        return y .* gradient(d.backend, d.f, d.xs)
+    end
+end
 
 struct LazyJacobian{B, F, X}
     backend::B
diff --git a/test/runtests.jl b/test/runtests.jl
index 843dde4..ead5e28 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -252,6 +252,44 @@ function test_fdm_lazy_derivatives(fdm_backend)
     @test (yvec,)*lazyder == yvec*lazyder
 end
 
+function test_fdm_lazy_gradients(fdm_backend)
+    # single input function
+    grad1 = AD.gradient(fdm_backend, x->fgrad(x, yvec), xvec)
+    grad2 = FDM.grad(fdm_backend.alg, x->fgrad(x, yvec), xvec)
+    lazygrad = AD.LazyGradient(fdm_backend, x->fgrad(x, yvec), xvec)
+
+    # multiplication with scalar
+    @test norm.(grad1.*yscalar .- grad2.*yscalar) == (0,)
+    @test norm.(lazygrad*yscalar .- grad1.*yscalar) == (0,)
+    @test lazygrad*yscalar isa Tuple
+
+    @test norm.(yscalar.*grad1 .- yscalar.*grad2) == (0,)
+    @test norm.(yscalar*lazygrad .- yscalar.*grad1) == (0,)
+    @test yscalar*lazygrad isa Tuple
+
+    # multiplication with tuple
+    @test lazygrad*(yscalar,) == lazygrad*yscalar
+    @test (yscalar,)*lazygrad == yscalar*lazygrad
+
+    # two input function
+    grad1 = AD.gradient(fdm_backend, fgrad, xvec, yvec)
+    grad2 = FDM.grad(fdm_backend.alg, fgrad, xvec, yvec)
+    lazygrad = AD.LazyGradient(fdm_backend, fgrad, (xvec, yvec))
+
+    # multiplication with scalar
+    @test norm.(grad1.*yscalar .- grad2.*yscalar) == (0,0)
+    @test norm.(lazygrad*yscalar .- grad1.*yscalar) == (0,0)
+    @test lazygrad*yscalar isa Tuple
+
+    @test norm.(yscalar.*grad1 .- yscalar.*grad2) == (0,0)
+    @test norm.(yscalar*lazygrad .- yscalar.*grad1) == (0,0)
+    @test yscalar*lazygrad isa Tuple
+
+    # multiplication with tuple
+    @test lazygrad*(yscalar,) == lazygrad*yscalar
+    @test (yscalar,)*lazygrad == yscalar*lazygrad
+end
+
 @testset "AbstractDifferentiation.jl" begin
     @testset "FiniteDifferences" begin
         @testset "Derivative" begin
@@ -290,5 +328,10 @@ end
             test_fdm_lazy_derivatives(fdm_backend2)
             test_fdm_lazy_derivatives(fdm_backend3)
         end
+        @testset "Lazy Gradient" begin
+            test_fdm_lazy_gradients(fdm_backend1)
+            test_fdm_lazy_gradients(fdm_backend2)
+            test_fdm_lazy_gradients(fdm_backend3)
+        end
     end
 end

From 6941f5ccfcc6c4726ebbec6dc051bd6c30d7848e Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Sun, 6 Jun 2021 20:24:52 +0200
Subject: [PATCH 13/25] lazy jacobian

---
 src/AbstractDifferentiation.jl |  47 +++++++++++++++-
 test/runtests.jl               | 100 ++++++++++++++++++++++++++++++---
 2 files changed, 137 insertions(+), 10 deletions(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index 636a344..77b1883 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -305,7 +305,16 @@ struct LazyJacobian{B, F, X}
     xs::X
 end
 function Base.:*(d::LazyJacobian, ys)
-    return pushforward_function(d.ab, d.f, d.xs...)(ys)
+    if d.xs isa Tuple
+        jvp = pushforward_function(d.backend, d.f, d.xs...)(ys)
+        if jvp isa Tuple
+            return vec.(jvp)
+        else
+            return vec(jvp)
+        end
+    else
+        return vec.(pushforward_function(d.backend, d.f, d.xs)(ys))
+    end
 end
 function Base.:*(ys, d::LazyJacobian)
     if ys isa Tuple
@@ -313,9 +322,43 @@ function Base.:*(ys, d::LazyJacobian)
     else
         ya = adjoint(ys)
     end
-    return pullback_function(d.ab, d.f, d.xs...)(ya)
+    if d.xs isa Tuple
+        return pullback_function(d.backend, d.f, d.xs...)(ya)
+    else
+        return pullback_function(d.backend, d.f, d.xs)(ya)
+    end
+end
+
+function Base.:*(d::LazyJacobian, ys::Number)
+    if d.xs isa Tuple
+        return jacobian(d.backend, d.f, d.xs...) .* ys
+    else
+        return jacobian(d.backend, d.f, d.xs) .* ys
+    end
 end
+
+function Base.:*(ys::Number, d::LazyJacobian)
+    if d.xs isa Tuple
+        return jacobian(d.backend, d.f, d.xs...) .* ys
+    else
+        return jacobian(d.backend, d.f, d.xs) .* ys
+    end
+end
+
+function Base.:*(d::LazyJacobian, ys::AbstractArray)
+    if d.xs isa Tuple
+        return vec.(pushforward_function(d.backend, d.f, d.xs...)((ys,)))
+    else
+        vjp = (pushforward_function(d.backend, d.f, d.xs)((ys,)))
+        if vjp isa Tuple
+            return vec.(vjp)
+        else
+            return (vec(vjp),)
+        end
+    end
+end
+
 struct LazyHessian{B, F, X}
     backend::B
     f::F
diff --git a/test/runtests.jl b/test/runtests.jl
index ead5e28..9247eb2 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -21,7 +21,9 @@ end
 FDMBackend2() = FDMBackend2(central_fdm(5, 1))
 const fdm_backend2 = FDMBackend2()
 AD.@primitive function pushforward_function(ab::FDMBackend2, f, xs...)
-    return (vs) -> jvp(ab.alg, f, tuple.(xs, vs)...)
+    return function (vs)
+        jvp(ab.alg, f, tuple.(xs, vs)...)
+    end
 end
 
 struct FDMBackend3{A} <: AD.AbstractFiniteDifference
@@ -148,19 +150,25 @@ end
 function test_fdm_jvp(fdm_backend)
     v = (rand(length(xvec)), rand(length(yvec)))
 
-    # augmented version of v
-    identity_like = AD.identity_matrix_like(v)
-    vaug = map(identity_like) do identity_like_i
-        identity_like_i .* v
-    end
+    if fdm_backend isa FDMBackend2 # augmented version of v
+        identity_like = AD.identity_matrix_like(v)
+        vaug = map(identity_like) do identity_like_i
+            identity_like_i .* v
+        end
 
-    pf1 = map(v->AD.pushforward_function(fdm_backend2, fjac, xvec, yvec)(v), vaug)
+        pf1 = map(v->AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v), vaug)
+        ((valvec1, pf3x), (valvec2, pf3y)) = map(v->AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v), vaug)
+    else
+        pf1 = AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
+        valvec, pf3 = AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v)
+        ((valvec1, pf3x), (valvec2, pf3y)) = (valvec, pf3[1]), (valvec, pf3[2])
+    end
     pf2 = (
         FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),
         FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])),
     )
     @test norm.(pf1 .- pf2) == (0, 0)
-    ((valvec1, pf3x), (valvec2, pf3y)) = map(v->AD.value_and_pushforward_function(fdm_backend2, fjac, xvec, yvec)(v), vaug)
+
     @test valvec1 == fjac(xvec, yvec)
     @test valvec2 == fjac(xvec, yvec)
     @test norm.((pf3x,pf3y) .- pf1) == (0, 0)
@@ -290,6 +298,77 @@ function test_fdm_lazy_gradients(fdm_backend)
     @test (yscalar,)*lazygrad == yscalar*lazygrad
 end
 
+function test_fdm_lazy_jacobians(fdm_backend)
+    # single input function
+    jac1 = AD.jacobian(fdm_backend, x->fjac(x, yvec), xvec)
+    jac2 = FDM.jacobian(fdm_backend.alg, x->fjac(x, yvec), xvec)
+    lazyjac = AD.LazyJacobian(fdm_backend, x->fjac(x, yvec), xvec)
+
+    # multiplication with scalar
+    @test norm.(jac1.*yscalar .- jac2.*yscalar) == (0,)
+    @test norm.(lazyjac*yscalar .- jac1.*yscalar) == (0,)
+    @test lazyjac*yscalar isa Tuple
+
+    @test norm.(yscalar.*jac1 .- yscalar.*jac2) == (0,)
+    @test norm.(yscalar*lazyjac .- yscalar.*jac1) == (0,)
+    @test yscalar*lazyjac isa Tuple
+
+    w = adjoint(rand(length(fjac(xvec, yvec))))
+    v = (rand(length(xvec)),rand(length(xvec)))
+
+    # vjp
+    pb1 = FDM.j′vp(fdm_backend.alg, x -> fjac(x, yvec), w, xvec)
+    res = w*lazyjac
+    @test minimum(isapprox.(pb1, res, atol=1e-10))
+    @test res isa Tuple
+
+    # jvp
+    pf1 = (FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),)
+    res = lazyjac*v[1]
+    @test minimum(isapprox.(pf1, res, atol=1e-10))
+    @test res isa Tuple
+
+    # two input function
+    jac1 = AD.jacobian(fdm_backend, fjac, xvec, yvec)
+    jac2 = FDM.jacobian(fdm_backend.alg, fjac, xvec, yvec)
+    lazyjac = AD.LazyJacobian(fdm_backend, fjac, (xvec, yvec))
+
+    # multiplication with scalar
+    @test norm.(jac1.*yscalar .- jac2.*yscalar) == (0,0)
+    @test norm.(lazyjac*yscalar .- jac1.*yscalar) == (0,0)
+    @test lazyjac*yscalar isa Tuple
+
+    @test norm.(yscalar.*jac1 .- yscalar.*jac2) == (0,0)
+    @test norm.(yscalar*lazyjac .- yscalar.*jac1) == (0,0)
+    @test yscalar*lazyjac isa Tuple
+
+    # vjp
+    pb1 = FDM.j′vp(fdm_backend.alg, fjac, w, xvec, yvec)
+    res = w*lazyjac
+    @test minimum(isapprox.(pb1, res, atol=1e-10))
+    @test res isa Tuple
+
+    # jvp
+    pf1 = (
+        FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),
+        FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])),
+    )
+
+    if fdm_backend isa FDMBackend2 # augmented version of v
+        identity_like = AD.identity_matrix_like(v)
+        vaug = map(identity_like) do identity_like_i
+            identity_like_i .* v
+        end
+
+        res = map(v->lazyjac*v, vaug)
+    else
+        res = lazyjac*v
+    end
+
+    @test minimum(isapprox.(pf1, res, atol=1e-10))
+    @test res isa Tuple
+end
+
 @testset "AbstractDifferentiation.jl" begin
     @testset "FiniteDifferences" begin
         @testset "Derivative" begin
@@ -333,5 +412,10 @@ end
             test_fdm_lazy_gradients(fdm_backend2)
             test_fdm_lazy_gradients(fdm_backend3)
         end
+        @testset "Lazy Jacobian" begin
+            test_fdm_lazy_jacobians(fdm_backend1)
+            test_fdm_lazy_jacobians(fdm_backend2)
+            test_fdm_lazy_jacobians(fdm_backend3)
+        end
     end
 end
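Usage note: the lazy Jacobian never materializes a matrix for vector products. Right-multiplication dispatches to the pushforward (a jvp), left-multiplication to the pullback (a j′vp), and only multiplication by a scalar falls back to assembling the full Jacobian. A sketch using the test fixtures above (`w` and `v` are illustrative):

    lazyjac = AD.LazyJacobian(fdm_backend1, x -> fjac(x, yvec), xvec)

    v = rand(length(xvec))
    lazyjac * v    # J*v via the pushforward, returned as a 1-tuple
    w = adjoint(rand(length(fjac(xvec, yvec))))
    w * lazyjac    # w*J via the pullback, returned as a 1-tuple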
.* y @@ -304,6 +303,7 @@ struct LazyJacobian{B, F, X} f::F xs::X end + function Base.:*(d::LazyJacobian, ys) if d.xs isa Tuple jvp = pushforward_function(d.backend, d.f, d.xs...)(ys) @@ -316,6 +316,7 @@ function Base.:*(d::LazyJacobian, ys) return vec.(pushforward_function(d.backend, d.f, d.xs)(ys)) end end + function Base.:*(ys, d::LazyJacobian) if ys isa Tuple ya = adjoint.(ys) @@ -364,13 +365,20 @@ struct LazyHessian{B, F, X} f::F xs::X end + function Base.:*(d::LazyHessian, ys) - return pushforward_function( - secondlowest(d.ab), - (xs...,) -> gradient(lowest(d.ab), d.f, xs...), - d.xs..., - )(ys) + if d.xs isa Tuple + return pushforward_function( + secondlowest(d.backend), + (xs...,) -> gradient(lowest(d.backend), d.f, xs...), d.xs...,)(ys) + else + return pushforward_function( + secondlowest(d.backend), + (xs,) -> gradient(lowest(d.backend), d.f, xs),d.xs,)(ys) + end + end + function Base.:*(ys, d::LazyHessian) if ys isa Tuple ya = adjoint.(ys) @@ -378,12 +386,61 @@ function Base.:*(ys, d::LazyHessian) ya = adjoint(ys) end return pullback_function( - secondlowest(d.ab), - (xs...,) -> gradient(lowest(d.ab), d.f, xs...), + secondlowest(d.backend), + (xs...,) -> gradient(lowest(d.backend), d.f, xs...), d.xs..., )(ya) end +function Base.:*(d::LazyHessian, ys::Number) + if d.xs isa Tuple + return hessian(d.backend, d.f, d.xs...).*ys + else + return hessian(d.backend, d.f, d.xs).*ys + end +end + +function Base.:*(ys::Number, d::LazyHessian) + if d.xs isa Tuple + return ys.*hessian(d.backend, d.f, d.xs...) + else + return ys.*hessian(d.backend, d.f, d.xs) + end +end + +function Base.:*(d::LazyHessian, ys::AbstractArray) + if d.xs isa Tuple + return pushforward_function( + secondlowest(d.backend), + (xs...,) -> gradient(lowest(d.backend), d.f, xs...), d.xs...,)((ys,)) + else + return pushforward_function( + secondlowest(d.backend), + (xs,) -> gradient(lowest(d.backend), d.f, xs),d.xs,)((ys,)) + end +end + +function Base.:*(ys::AbstractArray, d::LazyHessian) + if ys isa Tuple + ya = adjoint.(ys) + else + ya = adjoint(ys) + end + if d.xs isa Tuple + return pullback_function( + secondlowest(d.backend), + (xs...,) -> gradient(lowest(d.backend), d.f, xs...), + d.xs..., + )(ya) + else + return pullback_function( + secondlowest(d.backend), + (xs,) -> gradient(lowest(d.backend), d.f, xs)[1], + d.xs, + )(ya) + end +end + function lazyderivative(ab::AbstractBackend, f, xs::Number...) 
return LazyDerivative(ab, f, xs) end diff --git a/test/runtests.jl b/test/runtests.jl index 9247eb2..06d8365 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -369,6 +369,36 @@ function test_fdm_lazy_jacobians(fdm_backend) @test res isa Tuple end +function test_fdm_lazy_hessians(fdm_backend) + # single input function + fhess = x -> fgrad(x, yvec) + hess1 = (dfgraddxdx(xvec,yvec),) + lazyhess = AD.LazyHessian(fdm_backend, x->fgrad(x, yvec), xvec) + + # multiplication with scalar + @test minimum(isapprox.(lazyhess*yscalar, hess1.*yscalar, atol=1e-10)) + @test lazyhess*yscalar isa Tuple + + # multiplication with scalar + @test minimum(isapprox.(yscalar*lazyhess, yscalar.*hess1, atol=1e-10)) + @test yscalar*lazyhess isa Tuple + + w = adjoint(rand(length(xvec))) + v = rand(length(xvec)) + + # Hvp + Hv = map(h->h*v, hess1) + res = lazyhess*v + @test minimum(isapprox.(Hv, res, atol=1e-10)) + @test res isa Tuple + + # H′vp + wH = map(h->h'*adjoint(w), hess1) + res = w*lazyhess + @test minimum(isapprox.(wH, res, atol=1e-10)) + @test res isa Tuple +end + @testset "AbstractDifferentiation.jl" begin @testset "FiniteDifferences" begin @testset "Derivative" begin @@ -417,5 +447,10 @@ end test_fdm_lazy_jacobians(fdm_backend2) test_fdm_lazy_jacobians(fdm_backend3) end + @testset "Lazy Hessian" begin + test_fdm_lazy_hessians(fdm_backend1) + test_fdm_lazy_hessians(fdm_backend2) + test_fdm_lazy_hessians(fdm_backend3) + end end end From 5c0a147a47ed40df4b9157610a3bb6a2760a3412 Mon Sep 17 00:00:00 2001 From: Frank Schaefer Date: Tue, 29 Jun 2021 23:42:13 +0200 Subject: [PATCH 15/25] combine general fallback with abstract array --- src/AbstractDifferentiation.jl | 91 ++++++++++++---------------------- test/runtests.jl | 2 +- 2 files changed, 32 insertions(+), 61 deletions(-) diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl index 5014052..263ad26 100644 --- a/src/AbstractDifferentiation.jl +++ b/src/AbstractDifferentiation.jl @@ -305,15 +305,18 @@ struct LazyJacobian{B, F, X} end function Base.:*(d::LazyJacobian, ys) + if !(ys isa Tuple) + ys = (ys, ) + end if d.xs isa Tuple - jvp = pushforward_function(d.backend, d.f, d.xs...)(ys) - if jvp isa Tuple - return vec.(jvp) - else - return vec(jvp) - end + vjp = pushforward_function(d.backend, d.f, d.xs...)(ys) + else + vjp = pushforward_function(d.backend, d.f, d.xs)(ys) + end + if vjp isa Tuple + return vjp else - return vec.(pushforward_function(d.backend, d.f, d.xs)(ys)) + return (vjp,) end end @@ -346,19 +349,6 @@ function Base.:*(ys::Number, d::LazyJacobian) end end -function Base.:*(d::LazyJacobian, ys::AbstractArray) - if d.xs isa Tuple - return vec.(pushforward_function(d.backend, d.f, d.xs...)((ys,))) - else - vjp = (pushforward_function(d.backend, d.f, d.xs)((ys,))) - if vjp isa Tuple - return vec.(vjp) - else - return (vec(vjp),) - end - end -end - struct LazyHessian{B, F, X} backend::B @@ -367,16 +357,19 @@ struct LazyHessian{B, F, X} end function Base.:*(d::LazyHessian, ys) + if !(ys isa Tuple) + ys = (ys, ) + end + if d.xs isa Tuple return pushforward_function( secondlowest(d.backend), - (xs...,) -> gradient(lowest(d.backend), d.f, xs...), d.xs...,)(ys) + (xs...,) -> gradient(lowest(d.backend), d.f, xs...), d.xs...,)(ys) else return pushforward_function( secondlowest(d.backend), - (xs,) -> gradient(lowest(d.backend), d.f, xs),d.xs,)(ys) + (xs,) -> gradient(lowest(d.backend), d.f, xs),d.xs,)(ys) end - end function Base.:*(ys, d::LazyHessian) @@ -385,11 +378,19 @@ function Base.:*(ys, d::LazyHessian) else ya = 
adjoint(ys) end - return pullback_function( - secondlowest(d.backend), - (xs...,) -> gradient(lowest(d.backend), d.f, xs...), - d.xs..., - )(ya) + if d.xs isa Tuple + return pullback_function( + secondlowest(d.backend), + (xs...,) -> gradient(lowest(d.backend), d.f, xs...), + d.xs..., + )(ya) + else + return pullback_function( + secondlowest(d.backend), + (xs,) -> gradient(lowest(d.backend), d.f, xs)[1], + d.xs, + )(ya) + end end function Base.:*(d::LazyHessian, ys::Number) @@ -408,38 +409,6 @@ function Base.:*(ys::Number, d::LazyHessian) end end -function Base.:*(d::LazyHessian, ys::AbstractArray) - if d.xs isa Tuple - return pushforward_function( - secondlowest(d.backend), - (xs...,) -> gradient(lowest(d.backend), d.f, xs...), d.xs...,)((ys,)) - else - return pushforward_function( - secondlowest(d.backend), - (xs,) -> gradient(lowest(d.backend), d.f, xs),d.xs,)((ys,)) - end -end - -function Base.:*(ys::AbstractArray, d::LazyHessian) - if ys isa Tuple - ya = adjoint.(ys) - else - ya = adjoint(ys) - end - if d.xs isa Tuple - return pullback_function( - secondlowest(d.backend), - (xs...,) -> gradient(lowest(d.backend), d.f, xs...), - d.xs..., - )(ya) - else - return pullback_function( - secondlowest(d.backend), - (xs,) -> gradient(lowest(d.backend), d.f, xs)[1], - d.xs, - )(ya) - end -end function lazyderivative(ab::AbstractBackend, f, xs::Number...) return LazyDerivative(ab, f, xs) @@ -511,7 +480,9 @@ function define_pushforward_function_and_friends(fdef) end end elseif eltype(identity_like) <: AbstractMatrix + # needed for the computation of the Hessian ret = hcat.(mapslices(identity_like[1], dims=1) do cols + # cols loop over basis states pf = pff((cols,)) if typeof(pf) <: AbstractVector return (pf, ) diff --git a/test/runtests.jl b/test/runtests.jl index 06d8365..c50bdd1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -360,7 +360,7 @@ function test_fdm_lazy_jacobians(fdm_backend) identity_like_i .* v end - res = map(v->lazyjac*v, vaug) + res = map(v->(lazyjac*v)[1], vaug) else res = lazyjac*v end From 48af1083ad9e7e5e41397ad332b0278dd8c7add5 Mon Sep 17 00:00:00 2001 From: Frank Schaefer Date: Mon, 26 Jul 2021 19:34:57 +0200 Subject: [PATCH 16/25] reshape gradient and fix for AD.hessian --- src/AbstractDifferentiation.jl | 26 +++++++++++++++++++++----- test/runtests.jl | 27 +++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl index 263ad26..2a0a4eb 100644 --- a/src/AbstractDifferentiation.jl +++ b/src/AbstractDifferentiation.jl @@ -40,13 +40,23 @@ function derivative(ab::AbstractBackend, f, xs::Number...) end function gradient(ab::AbstractBackend, f, xs...) - return adjoint.(jacobian(lowest(ab), f, xs...)) + return reshape.(adjoint.(jacobian(lowest(ab), f, xs...)),size.(xs)) end function jacobian(ab::AbstractBackend, f, xs...) end function hessian(ab::AbstractBackend, f, xs...) - return jacobian(secondlowest(ab), (xs...,) -> begin - gradient(lowest(ab), f, xs...) - end, xs...) + xss = collect((xs...,)) + counter = 0 + # gradient returns tuple of gradient values with respect to inputs x,y ∈ xs + # Hessian is the Jacobian of the individual gradients, i.e., the tuple of matrices + # defined by ∂x∂x f, ∂y∂y f, in the case of a scalar valued function `f`. + hess = map((xs...,)) do x + counter += 1 + _f = _x->f(setindex!(deepcopy(xss),x,counter)...) 
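+        # `_f` freezes every input except the `counter`-th one, so the Jacobian
+        # of its gradient below is the corresponding diagonal Hessian block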
+ return jacobian(secondlowest(ab),(x,)-> begin + return gradient(lowest(ab), _f, x) + end, x)[1] + end + return hess end function value_and_derivative(ab::AbstractBackend, f, xs::Number...) @@ -55,7 +65,7 @@ function value_and_derivative(ab::AbstractBackend, f, xs::Number...) end function value_and_gradient(ab::AbstractBackend, f, xs...) value, jacs = value_and_jacobian(lowest(ab), f, xs...) - return value, adjoint.(jacs) + return value, reshape.(adjoint.(jacs),size.(xs)) end function value_and_jacobian(ab::AbstractBackend, f, xs...) local value @@ -481,9 +491,12 @@ function define_pushforward_function_and_friends(fdef) end elseif eltype(identity_like) <: AbstractMatrix # needed for the computation of the Hessian + println("define_pushforward_function_and_friends") + @show identity_like ret = hcat.(mapslices(identity_like[1], dims=1) do cols # cols loop over basis states pf = pff((cols,)) + @show cols pf if typeof(pf) <: AbstractVector return (pf, ) elseif typeof(pf) <: AbstractMatrix @@ -518,6 +531,9 @@ function define_pullback_function_and_friends(fdef) end end elseif eltype(identity_like) <: AbstractMatrix + # needed for Hessian computation: + # value is a (grad,). Then, identity_like is a (matrix,). + # cols loops over columns of the matrix return vcat.(mapslices(identity_like[1], dims=1) do cols adjoint.(value_and_pbf((cols,))[2]) end ...) diff --git a/test/runtests.jl b/test/runtests.jl index c50bdd1..23ce6dc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -52,6 +52,7 @@ fgrad(x, y) = prod(x) + sum(y ./ (1:length(y))) dfgraddx(x, y) = prod(x)./x dfgraddy(x, y) = one(eltype(y)) ./ (1:length(y)) dfgraddxdx(x, y) = prod(x)./(x*x') - Diagonal(diag(prod(x)./(x*x'))) +dfgraddydy(x, y) = zeros(length(y),length(y)) function fjac(x, y) x + Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U) * y @@ -89,6 +90,13 @@ function test_fdm_derivatives(fdm_backend) @test der3 .- der1 == (0, 0) der_exact = (dfderdx(xscalar,yscalar), dfderdy(xscalar,yscalar)) @test minimum(isapprox.(der_exact, der1, rtol=1e-10)) + # test if single input (no tuple works) + valscalara, dera = AD.value_and_derivative(fdm_backend, x -> fder(x, yscalar), xscalar) + valscalarb, derb = AD.value_and_derivative(fdm_backend, y -> fder(xscalar, y), yscalar) + @test valscalar == valscalara + @test valscalar == valscalarb + @test isapprox(dera[1], der1[1], rtol=1e-10) + @test isapprox(derb[1], der1[2], rtol=1e-10) end function test_fdm_gradients(fdm_backend) @@ -102,6 +110,13 @@ function test_fdm_gradients(fdm_backend) @test minimum(isapprox.(grad_exact, grad1, rtol=1e-10)) @test xvec == xvec2 @test yvec == yvec2 + # test if single input (no tuple works) + valscalara, grada = AD.value_and_gradient(fdm_backend, x -> fgrad(x, yvec), xvec) + valscalarb, gradb = AD.value_and_gradient(fdm_backend, y -> fgrad(xvec, y), yvec) + @test valscalar == valscalara + @test valscalar == valscalarb + @test isapprox(grada[1], grad1[1], rtol=1e-10) + @test isapprox(gradb[1], grad1[2], rtol=1e-10) end function test_fdm_jacobians(fdm_backend) @@ -115,9 +130,21 @@ function test_fdm_jacobians(fdm_backend) @test minimum(isapprox.(grad_exact, jac1, rtol=1e-10)) @test xvec == xvec2 @test yvec == yvec2 + # test if single input (no tuple works) + valveca, jaca = AD.value_and_jacobian(fdm_backend, x -> fjac(x, yvec), xvec) + valvecb, jacb = AD.value_and_jacobian(fdm_backend, y -> fjac(xvec, y), yvec) + @test valvec == valveca + @test valvec == valvecb + @test isapprox(jaca[1], jac1[1], rtol=1e-10) + @test isapprox(jacb[1], jac1[2], rtol=1e-10) 
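+    # single-input calls return 1-tuples, hence the `[1]` indexing above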
 end

 function test_fdm_hessians(fdm_backend)
+    H1 = AD.hessian(fdm_backend, fgrad, xvec, yvec)
+    @test dfgraddxdx(xvec,yvec) ≈ H1[1] atol=1e-10
+    @test dfgraddydy(xvec,yvec) ≈ H1[2] atol=1e-10
+
+    # test if single input (no tuple) works
     fhess = x -> fgrad(x, yvec)
     hess1 = AD.hessian(fdm_backend, fhess, xvec)
     hess2 = FDM.jacobian(

From a9f1538d8154c40e53d8bea0aae2198932b40250 Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Tue, 27 Jul 2021 16:17:18 +0200
Subject: [PATCH 17/25] remove prints

---
 src/AbstractDifferentiation.jl | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index 2a0a4eb..15ba75b 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -51,7 +51,7 @@ function hessian(ab::AbstractBackend, f, xs...)
     # defined by ∂x∂x f, ∂y∂y f, in the case of a scalar valued function `f`.
     hess = map((xs...,)) do x
         counter += 1
-        _f = _x->f(setindex!(deepcopy(xss),x,counter)...)
+        _f = _x->f(setindex!(deepcopy(xss),_x,counter)...)
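+        # note: the closure argument `_x` is now spliced in instead of the
+        # captured `x`, so each partial function actually varies its input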
return jacobian(secondlowest(ab),(x,)-> begin return gradient(lowest(ab), _f, x) diff --git a/test/runtests.jl b/test/runtests.jl index 23ce6dc..05e9df6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,10 +1,12 @@ using AbstractDifferentiation using Test, FiniteDifferences, LinearAlgebra +using ForwardDiff using Random Random.seed!(1234) const FDM = FiniteDifferences +## FiniteDifferences struct FDMBackend1{A} <: AD.AbstractFiniteDifference alg::A end @@ -43,6 +45,77 @@ AD.@primitive function pullback_function(ab::FDMBackend3, f, xs...) end end end +## + + +## ForwardDiff +struct ForwardDiffBackend1 <: AD.AbstractForwardMode end +const forwarddiff_backend1 = ForwardDiffBackend1() +AD.@primitive function jacobian(ab::ForwardDiffBackend1, f, xs...) + if xs isa AbstractArray + return ForwardDiff.jacobian(f, xs) + elseif xs isa Number + return ForwardDiff.derivative(f, xs) + elseif xs isa Tuple + @assert length(xs) <= 2 + if eltype(xs) <: Number + # Can this be avoided for the derivative computation? It seems to defeat the purpose of AbstractDifferentiation. + # If eltype(xs) isa Number, ForwardDiff will error by saying that Jacobian expects an array. + # if we convert the number to an array in AbstractDifferentiation, we get a no method error due to log(x) and exp(x) + if length(xs) == 1 + (ForwardDiff.derivative(f, xs[1]),) + else + _f1 = x -> f(x,xs[2]) + _f2 = x -> f(xs[1], x) + (ForwardDiff.derivative(_f1, xs[1]),ForwardDiff.derivative(_f2, xs[2])) + end + else + if length(xs) == 1 + out = f(xs[1]) + if out isa Number + # Lazy Jacobian test pb fails otherwise with "dimensions must match: a has dims (Base.OneTo(1), Base.OneTo(5)), b has dims (Base.OneTo(5),), mismatch at 1" + (reshape(ForwardDiff.gradient(f, xs[1]),(1,length(xs[1]))),) + else + (ForwardDiff.jacobian(f, xs[1]),) + end + else + out = f(xs...) + _f1 = x -> f(x,xs[2]) + _f2 = x -> f(xs[1], x) + if out isa Number + # reshape for pullback tests (similar as above) + (reshape(ForwardDiff.gradient(_f1, xs[1]),(1,length(xs[1]))), + reshape(ForwardDiff.gradient(_f2, xs[2]), (1, length(xs[2])))) + else + (ForwardDiff.jacobian(_f1, xs[1]),ForwardDiff.jacobian(_f2, xs[2])) + end + + end + end + else + error(typeof(xs)) + end +end + +struct ForwardDiffBackend2 <: AD.AbstractForwardMode end +const forwarddiff_backend2 = ForwardDiffBackend2() +AD.@primitive function pushforward_function(ab::ForwardDiffBackend2, f, xs...) 
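+    # Like FDMBackend2, this backend pushes one tangent seed through at a time,
+    # which is why multi-input tests build an "augmented" tuple of seeds.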
+ # jvp = f'(x)*v, i.e., differentiate f(x + h*v) wrt h at 0 + return function (vs) + if xs isa Tuple + @assert length(xs) <= 2 + if length(xs) == 1 + (ForwardDiff.derivative(h->f(xs[1]+h*vs[1]),0),) + else + ForwardDiff.derivative(h->f(xs[1]+h*vs[1], xs[2]+h*vs[2]),0) + end + else + ForwardDiff.derivative(h->f(xs+h*vs),0) + end + end +end +## + fder(x, y) = exp(y) * x + y * log(x) dfderdx(x, y) = exp(y) + y * 1/x @@ -78,32 +151,34 @@ const yvec = rand(5) xvec2 = deepcopy(xvec) yvec2 = deepcopy(yvec) -function test_fdm_derivatives(fdm_backend) - der1 = AD.derivative(fdm_backend, fder, xscalar, yscalar) + +function test_fdm_derivatives(backend, fdm_backend) + # fdm_backend for comparison with finite differences + der1 = AD.derivative(backend, fder, xscalar, yscalar) der2 = ( fdm_backend.alg(x -> fder(x, yscalar), xscalar), fdm_backend.alg(y -> fder(xscalar, y), yscalar), ) - @test norm.(der1 .- der2) == (0, 0) - valscalar, der3 = AD.value_and_derivative(fdm_backend, fder, xscalar, yscalar) + @test minimum(isapprox.(der1, der2, rtol=1e-10)) + valscalar, der3 = AD.value_and_derivative(backend, fder, xscalar, yscalar) @test valscalar == fder(xscalar, yscalar) @test der3 .- der1 == (0, 0) der_exact = (dfderdx(xscalar,yscalar), dfderdy(xscalar,yscalar)) @test minimum(isapprox.(der_exact, der1, rtol=1e-10)) # test if single input (no tuple works) - valscalara, dera = AD.value_and_derivative(fdm_backend, x -> fder(x, yscalar), xscalar) - valscalarb, derb = AD.value_and_derivative(fdm_backend, y -> fder(xscalar, y), yscalar) + valscalara, dera = AD.value_and_derivative(backend, x -> fder(x, yscalar), xscalar) + valscalarb, derb = AD.value_and_derivative(backend, y -> fder(xscalar, y), yscalar) @test valscalar == valscalara @test valscalar == valscalarb @test isapprox(dera[1], der1[1], rtol=1e-10) @test isapprox(derb[1], der1[2], rtol=1e-10) end -function test_fdm_gradients(fdm_backend) - grad1 = AD.gradient(fdm_backend, fgrad, xvec, yvec) +function test_fdm_gradients(backend, fdm_backend) + grad1 = AD.gradient(backend, fgrad, xvec, yvec) grad2 = FDM.grad(fdm_backend.alg, fgrad, xvec, yvec) - @test norm.(grad1 .- grad2) == (0, 0) - valscalar, grad3 = AD.value_and_gradient(fdm_backend, fgrad, xvec, yvec) + @test minimum(isapprox.(grad1, grad2, rtol=1e-10)) + valscalar, grad3 = AD.value_and_gradient(backend, fgrad, xvec, yvec) @test valscalar == fgrad(xvec, yvec) @test norm.(grad3 .- grad1) == (0, 0) grad_exact = (dfgraddx(xvec,yvec), dfgraddy(xvec,yvec)) @@ -111,19 +186,19 @@ function test_fdm_gradients(fdm_backend) @test xvec == xvec2 @test yvec == yvec2 # test if single input (no tuple works) - valscalara, grada = AD.value_and_gradient(fdm_backend, x -> fgrad(x, yvec), xvec) - valscalarb, gradb = AD.value_and_gradient(fdm_backend, y -> fgrad(xvec, y), yvec) + valscalara, grada = AD.value_and_gradient(backend, x -> fgrad(x, yvec), xvec) + valscalarb, gradb = AD.value_and_gradient(backend, y -> fgrad(xvec, y), yvec) @test valscalar == valscalara @test valscalar == valscalarb @test isapprox(grada[1], grad1[1], rtol=1e-10) @test isapprox(gradb[1], grad1[2], rtol=1e-10) end -function test_fdm_jacobians(fdm_backend) - jac1 = AD.jacobian(fdm_backend, fjac, xvec, yvec) +function test_fdm_jacobians(backend,fdm_backend) + jac1 = AD.jacobian(backend, fjac, xvec, yvec) jac2 = FDM.jacobian(fdm_backend.alg, fjac, xvec, yvec) - @test norm.(jac1 .- jac2) == (0, 0) - valvec, jac3 = AD.value_and_jacobian(fdm_backend, fjac, xvec, yvec) + @test minimum(isapprox.(jac1, jac2, rtol=1e-10)) + valvec, jac3 = 
AD.value_and_jacobian(backend, fjac, xvec, yvec) @test valvec == fjac(xvec, yvec) @test norm.(jac3 .- jac1) == (0, 0) grad_exact = (dfjacdx(xvec, yvec), dfjacdy(xvec, yvec)) @@ -131,22 +206,22 @@ function test_fdm_jacobians(fdm_backend) @test xvec == xvec2 @test yvec == yvec2 # test if single input (no tuple works) - valveca, jaca = AD.value_and_jacobian(fdm_backend, x -> fjac(x, yvec), xvec) - valvecb, jacb = AD.value_and_jacobian(fdm_backend, y -> fjac(xvec, y), yvec) + valveca, jaca = AD.value_and_jacobian(backend, x -> fjac(x, yvec), xvec) + valvecb, jacb = AD.value_and_jacobian(backend, y -> fjac(xvec, y), yvec) @test valvec == valveca @test valvec == valvecb @test isapprox(jaca[1], jac1[1], rtol=1e-10) @test isapprox(jacb[1], jac1[2], rtol=1e-10) end -function test_fdm_hessians(fdm_backend) - H1 = AD.hessian(fdm_backend, fgrad, xvec, yvec) +function test_fdm_hessians(backend, fdm_backend) + H1 = AD.hessian(backend, fgrad, xvec, yvec) @test dfgraddxdx(xvec,yvec) ≈ H1[1] atol=1e-10 @test dfgraddydy(xvec,yvec) ≈ H1[2] atol=1e-10 # test if single input (no tuple works) fhess = x -> fgrad(x, yvec) - hess1 = AD.hessian(fdm_backend, fhess, xvec) + hess1 = AD.hessian(backend, fhess, xvec) hess2 = FDM.jacobian( fdm_backend.alg, (x) -> begin @@ -158,43 +233,43 @@ function test_fdm_hessians(fdm_backend) end, xvec, ) - @test norm.(hess1 .- hess2) == (0,) - valscalar, hess3 = AD.value_and_hessian(fdm_backend, fhess, xvec) + @test minimum(isapprox.(hess1, hess2, rtol=1e-10)) + valscalar, hess3 = AD.value_and_hessian(backend, fhess, xvec) @test valscalar == fgrad(xvec, yvec) @test norm.(hess3 .- hess1) == (0,) - valscalar, grad, hess4 = AD.value_gradient_and_hessian(fdm_backend, fhess, xvec) + valscalar, grad, hess4 = AD.value_gradient_and_hessian(backend, fhess, xvec) @test valscalar == fgrad(xvec, yvec) - @test norm.(grad .- AD.gradient(fdm_backend, fhess, xvec)) == (0,) + @test norm.(grad .- AD.gradient(backend, fhess, xvec)) == (0,) @test norm.(hess4 .- hess1) == (0,) @test dfgraddxdx(xvec,yvec) ≈ hess1[1] atol=1e-10 @test xvec == xvec2 @test yvec == yvec2 fhess2 = x-> dfgraddx(x, yvec) - hess5 = AD.jacobian(fdm_backend, fhess2, xvec) + hess5 = AD.jacobian(backend, fhess2, xvec) @test minimum(isapprox.(hess5, hess1, atol=1e-10)) end -function test_fdm_jvp(fdm_backend) +function test_fdm_jvp(backend,fdm_backend) v = (rand(length(xvec)), rand(length(yvec))) - if fdm_backend isa FDMBackend2 # augmented version of v + if backend isa Union{FDMBackend2,ForwardDiffBackend2} # augmented version of v identity_like = AD.identity_matrix_like(v) vaug = map(identity_like) do identity_like_i identity_like_i .* v end - pf1 = map(v->AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v), vaug) - ((valvec1, pf3x), (valvec2, pf3y)) = map(v->AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v), vaug) + pf1 = map(v->AD.pushforward_function(backend, fjac, xvec, yvec)(v), vaug) + ((valvec1, pf3x), (valvec2, pf3y)) = map(v->AD.value_and_pushforward_function(backend, fjac, xvec, yvec)(v), vaug) else - pf1 = AD.pushforward_function(fdm_backend, fjac, xvec, yvec)(v) - valvec, pf3 = AD.value_and_pushforward_function(fdm_backend, fjac, xvec, yvec)(v) + pf1 = AD.pushforward_function(backend, fjac, xvec, yvec)(v) + valvec, pf3 = AD.value_and_pushforward_function(backend, fjac, xvec, yvec)(v) ((valvec1, pf3x), (valvec2, pf3y)) = (valvec, pf3[1]), (valvec, pf3[2]) end pf2 = ( FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])), FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])), ) - @test 
norm.(pf1 .- pf2) == (0, 0) + @test minimum(isapprox.(pf1, pf2, rtol=1e-10)) @test valvec1 == fjac(xvec, yvec) @test valvec2 == fjac(xvec, yvec) @@ -204,12 +279,12 @@ function test_fdm_jvp(fdm_backend) @test yvec == yvec2 end -function test_fdm_j′vp(fdm_backend) +function test_fdm_j′vp(backend,fdm_backend) w = rand(length(fjac(xvec, yvec))) - pb1 = AD.pullback_function(fdm_backend, fjac, xvec, yvec)(w) + pb1 = AD.pullback_function(backend, fjac, xvec, yvec)(w) pb2 = FDM.j′vp(fdm_backend.alg, fjac, w, xvec, yvec) @test all(norm.(pb1 .- pb2) .<= (1e-10, 1e-10)) - valvec, pb3 = AD.value_and_pullback_function(fdm_backend, fjac, xvec, yvec)(w) + valvec, pb3 = AD.value_and_pullback_function(backend, fjac, xvec, yvec)(w) @test valvec == fjac(xvec, yvec) @test norm.(pb3 .- pb1) == (0, 0) @test minimum(isapprox.(pb1, (vJxp(xvec,yvec,w), vJyp(xvec,yvec,w)), atol=1e-10)) @@ -217,31 +292,31 @@ function test_fdm_j′vp(fdm_backend) @test yvec == yvec2 end -function test_fdm_lazy_derivatives(fdm_backend) +function test_fdm_lazy_derivatives(backend,fdm_backend) # single input function - der1 = AD.derivative(fdm_backend, x->fder(x, yscalar), xscalar) + der1 = AD.derivative(backend, x->fder(x, yscalar), xscalar) der2 = ( fdm_backend.alg(x -> fder(x, yscalar), xscalar), fdm_backend.alg(y -> fder(xscalar, y), yscalar), ) - lazyder = AD.LazyDerivative(fdm_backend, x->fder(x, yscalar), xscalar) + lazyder = AD.LazyDerivative(backend, x->fder(x, yscalar), xscalar) # multiplication with scalar - @test der1[1]*yscalar == der2[1]*yscalar + @test isapprox(der1[1]*yscalar, der2[1]*yscalar, atol=1e-10) @test lazyder*yscalar == der1.*yscalar @test lazyder*yscalar isa Tuple - @test yscalar*der1[1] == yscalar*der2[1] - @test yscalar*lazyder == yscalar.*der1 + @test isapprox(yscalar*der1[1], yscalar*der2[1], atol=1e-10) + @test yscalar*lazyder == yscalar.*der1 @test yscalar*lazyder isa Tuple # multiplication with array - @test der1[1]*yvec == der2[1]*yvec + @test isapprox(der1[1]*yvec, der2[1]*yvec, atol=1e-10) @test lazyder*yvec == (der1.*yvec,) @test lazyder*yvec isa Tuple - @test yvec*der1[1] == yvec*der2[1] + @test isapprox(yvec*der1[1], yvec*der2[1], atol=1e-10) @test yvec*lazyder == (yvec.*der1,) @test yvec*lazyder isa Tuple @@ -253,29 +328,29 @@ function test_fdm_lazy_derivatives(fdm_backend) @test (yvec,)*lazyder == yvec*lazyder # two input function - der1 = AD.derivative(fdm_backend, fder, xscalar, yscalar) + der1 = AD.derivative(backend, fder, xscalar, yscalar) der2 = ( fdm_backend.alg(x -> fder(x, yscalar), xscalar), fdm_backend.alg(y -> fder(xscalar, y), yscalar), ) - lazyder = AD.LazyDerivative(fdm_backend, fder, (xscalar, yscalar)) + lazyder = AD.LazyDerivative(backend, fder, (xscalar, yscalar)) # multiplication with scalar - @test der1.*yscalar == der2.*yscalar + @test minimum(isapprox.(der1.*yscalar, der2.*yscalar, atol=1e-10)) @test lazyder*yscalar == der1.*yscalar @test lazyder*yscalar isa Tuple - @test yscalar.*der1 == yscalar.*der2 + @test minimum(isapprox.(yscalar.*der1, yscalar.*der2, atol=1e-10)) @test yscalar*lazyder == yscalar.*der1 @test yscalar*lazyder isa Tuple # multiplication with array - @test (der1[1]*yvec, der1[2]*yvec) == (der2[1]*yvec, der2[2]*yvec) + @test minimum(isapprox.((der1[1]*yvec, der1[2]*yvec), (der2[1]*yvec, der2[2]*yvec), atol=1e-10)) @test lazyder*yvec == (der1[1]*yvec, der1[2]*yvec) @test lazyder*yvec isa Tuple - @test (yvec*der1[1], yvec*der1[2]) == (yvec*der2[1], yvec*der2[2]) + @test minimum(isapprox.((yvec*der1[1], yvec*der1[2]), (yvec*der2[1], yvec*der2[2]), atol=1e-10)) 
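+    # lazy products must agree exactly (==) with the eager derivative tuple;
+    # only the comparisons against finite differences above use a tolerance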
@test yvec*lazyder == (yvec*der1[1], yvec*der1[2]) @test lazyder*yvec isa Tuple @@ -287,18 +362,18 @@ function test_fdm_lazy_derivatives(fdm_backend) @test (yvec,)*lazyder == yvec*lazyder end -function test_fdm_lazy_gradients(fdm_backend) +function test_fdm_lazy_gradients(backend,fdm_backend) # single input function - grad1 = AD.gradient(fdm_backend, x->fgrad(x, yvec), xvec) + grad1 = AD.gradient(backend, x->fgrad(x, yvec), xvec) grad2 = FDM.grad(fdm_backend.alg, x->fgrad(x, yvec), xvec) - lazygrad = AD.LazyGradient(fdm_backend, x->fgrad(x, yvec), xvec) + lazygrad = AD.LazyGradient(backend, x->fgrad(x, yvec), xvec) # multiplication with scalar - @test norm.(grad1.*yscalar .- grad2.*yscalar) == (0,) + @test minimum(isapprox.(grad1.*yscalar, grad2.*yscalar, atol=1e-10)) @test norm.(lazygrad*yscalar .- grad1.*yscalar) == (0,) @test lazygrad*yscalar isa Tuple - @test norm.(yscalar.*grad1 .- yscalar.*grad2) == (0,) + @test minimum(isapprox.(yscalar.*grad1, yscalar.*grad2, atol=1e-10)) @test norm.(yscalar*lazygrad .- yscalar.*grad1) == (0,) @test yscalar*lazygrad isa Tuple @@ -307,16 +382,16 @@ function test_fdm_lazy_gradients(fdm_backend) @test (yscalar,)*lazygrad == yscalar*lazygrad # two input function - grad1 = AD.gradient(fdm_backend, fgrad, xvec, yvec) + grad1 = AD.gradient(backend, fgrad, xvec, yvec) grad2 = FDM.grad(fdm_backend.alg, fgrad, xvec, yvec) - lazygrad = AD.LazyGradient(fdm_backend, fgrad, (xvec, yvec)) + lazygrad = AD.LazyGradient(backend, fgrad, (xvec, yvec)) # multiplication with scalar - @test norm.(grad1.*yscalar .- grad2.*yscalar) == (0,0) + @test minimum(isapprox.(grad1.*yscalar, grad2.*yscalar, atol=1e-10)) @test norm.(lazygrad*yscalar .- grad1.*yscalar) == (0,0) @test lazygrad*yscalar isa Tuple - @test norm.(yscalar.*grad1 .- yscalar.*grad2) == (0,0) + @test minimum(isapprox.(yscalar.*grad1, yscalar.*grad2, atol=1e-10)) @test norm.(yscalar*lazygrad .- yscalar.*grad1) == (0,0) @test yscalar*lazygrad isa Tuple @@ -325,18 +400,18 @@ function test_fdm_lazy_gradients(fdm_backend) @test (yscalar,)*lazygrad == yscalar*lazygrad end -function test_fdm_lazy_jacobians(fdm_backend) +function test_fdm_lazy_jacobians(backend,fdm_backend) # single input function - jac1 = AD.jacobian(fdm_backend, x->fjac(x, yvec), xvec) + jac1 = AD.jacobian(backend, x->fjac(x, yvec), xvec) jac2 = FDM.jacobian(fdm_backend.alg, x->fjac(x, yvec), xvec) - lazyjac = AD.LazyJacobian(fdm_backend, x->fjac(x, yvec), xvec) + lazyjac = AD.LazyJacobian(backend, x->fjac(x, yvec), xvec) # multiplication with scalar - @test norm.(jac1.*yscalar .- jac2.*yscalar) == (0,) + @test minimum(isapprox.(jac1.*yscalar, jac2.*yscalar, atol=1e-10)) @test norm.(lazyjac*yscalar .- jac1.*yscalar) == (0,) @test lazyjac*yscalar isa Tuple - @test norm.(yscalar.*jac1 .- yscalar.*jac2) == (0,) + @test minimum(isapprox.(yscalar.*jac1, yscalar.*jac2, atol=1e-10)) @test norm.(yscalar*lazyjac .- yscalar.*jac1) == (0,) @test yscalar*lazyjac isa Tuple @@ -356,16 +431,16 @@ function test_fdm_lazy_jacobians(fdm_backend) @test res isa Tuple # two input function - jac1 = AD.jacobian(fdm_backend, fjac, xvec, yvec) + jac1 = AD.jacobian(backend, fjac, xvec, yvec) jac2 = FDM.jacobian(fdm_backend.alg, fjac, xvec, yvec) - lazyjac = AD.LazyJacobian(fdm_backend, fjac, (xvec, yvec)) + lazyjac = AD.LazyJacobian(backend, fjac, (xvec, yvec)) # multiplication with scalar - @test norm.(jac1.*yscalar .- jac2.*yscalar) == (0,0) + @test minimum(isapprox.(jac1.*yscalar, jac2.*yscalar, atol=1e-10)) @test norm.(lazyjac*yscalar .- jac1.*yscalar) == (0,0) @test 
lazyjac*yscalar isa Tuple - @test norm.(yscalar.*jac1 .- yscalar.*jac2) == (0,0) + @test minimum(isapprox.(yscalar.*jac1, yscalar.*jac2, atol=1e-10)) @test norm.(yscalar*lazyjac .- yscalar.*jac1) == (0,0) @test yscalar*lazyjac isa Tuple @@ -381,7 +456,7 @@ function test_fdm_lazy_jacobians(fdm_backend) FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])), ) - if fdm_backend isa FDMBackend2 # augmented version of v + if backend isa Union{FDMBackend2,ForwardDiffBackend2} # augmented version of v identity_like = AD.identity_matrix_like(v) vaug = map(identity_like) do identity_like_i identity_like_i .* v @@ -396,11 +471,12 @@ function test_fdm_lazy_jacobians(fdm_backend) @test res isa Tuple end -function test_fdm_lazy_hessians(fdm_backend) +function test_fdm_lazy_hessians(backend,fdm_backend) + # fdm_backend not used here yet.. # single input function fhess = x -> fgrad(x, yvec) hess1 = (dfgraddxdx(xvec,yvec),) - lazyhess = AD.LazyHessian(fdm_backend, x->fgrad(x, yvec), xvec) + lazyhess = AD.LazyHessian(backend, fhess, xvec) # multiplication with scalar @test minimum(isapprox.(lazyhess*yscalar, hess1.*yscalar, atol=1e-10)) @@ -427,57 +503,102 @@ function test_fdm_lazy_hessians(fdm_backend) end @testset "AbstractDifferentiation.jl" begin + testFDMbackend = fdm_backend1 @testset "FiniteDifferences" begin @testset "Derivative" begin - test_fdm_derivatives(fdm_backend1) - test_fdm_derivatives(fdm_backend2) - test_fdm_derivatives(fdm_backend3) + test_fdm_derivatives(fdm_backend1,testFDMbackend) + test_fdm_derivatives(fdm_backend2,testFDMbackend) + test_fdm_derivatives(fdm_backend3,testFDMbackend) end @testset "Gradient" begin - test_fdm_gradients(fdm_backend1) - test_fdm_gradients(fdm_backend2) - test_fdm_gradients(fdm_backend3) + test_fdm_gradients(fdm_backend1,testFDMbackend) + test_fdm_gradients(fdm_backend2,testFDMbackend) + test_fdm_gradients(fdm_backend3,testFDMbackend + ) end @testset "Jacobian" begin - test_fdm_jacobians(fdm_backend1) - test_fdm_jacobians(fdm_backend2) - test_fdm_jacobians(fdm_backend3) + test_fdm_jacobians(fdm_backend1,testFDMbackend) + test_fdm_jacobians(fdm_backend2,testFDMbackend) + test_fdm_jacobians(fdm_backend3,testFDMbackend) end @testset "Hessian" begin # Works but super slow - test_fdm_hessians(fdm_backend1) - test_fdm_hessians(fdm_backend2) - test_fdm_hessians(fdm_backend3) + test_fdm_hessians(fdm_backend1,testFDMbackend) + test_fdm_hessians(fdm_backend2,testFDMbackend) + test_fdm_hessians(fdm_backend3,testFDMbackend) + end + @testset "jvp" begin + test_fdm_jvp(fdm_backend1,testFDMbackend) + test_fdm_jvp(fdm_backend2,testFDMbackend) + test_fdm_jvp(fdm_backend3,testFDMbackend) + end + @testset "j′vp" begin + test_fdm_j′vp(fdm_backend1,testFDMbackend) + test_fdm_j′vp(fdm_backend2,testFDMbackend) + test_fdm_j′vp(fdm_backend3,testFDMbackend) + end + @testset "Lazy Derivative" begin + test_fdm_lazy_derivatives(fdm_backend1,testFDMbackend) + test_fdm_lazy_derivatives(fdm_backend2,testFDMbackend) + test_fdm_lazy_derivatives(fdm_backend3,testFDMbackend) + end + @testset "Lazy Gradient" begin + test_fdm_lazy_gradients(fdm_backend1,testFDMbackend) + test_fdm_lazy_gradients(fdm_backend2,testFDMbackend) + test_fdm_lazy_gradients(fdm_backend3,testFDMbackend) + end + @testset "Lazy Jacobian" begin + test_fdm_lazy_jacobians(fdm_backend1,testFDMbackend) + test_fdm_lazy_jacobians(fdm_backend2,testFDMbackend) + test_fdm_lazy_jacobians(fdm_backend3,testFDMbackend) + end + @testset "Lazy Hessian" begin + test_fdm_lazy_hessians(fdm_backend1,testFDMbackend) + 
test_fdm_lazy_hessians(fdm_backend2,testFDMbackend) + test_fdm_lazy_hessians(fdm_backend3,testFDMbackend) + end + end + @testset "ForwardDiff" begin + @testset "Derivative" begin + test_fdm_derivatives(forwarddiff_backend1,testFDMbackend) + test_fdm_derivatives(forwarddiff_backend2,testFDMbackend) + end + @testset "Gradient" begin + test_fdm_gradients(forwarddiff_backend1,testFDMbackend) + test_fdm_gradients(forwarddiff_backend2,testFDMbackend) + end + @testset "Jacobian" begin + test_fdm_jacobians(forwarddiff_backend1,testFDMbackend) + test_fdm_jacobians(forwarddiff_backend2,testFDMbackend) + end + @testset "Hessian" begin + # Fails due to setindex! in AbstractDifferentiation Hessian function + @test_broken test_fdm_hessians(forwarddiff_backend1,testFDMbackend) + @test_broken test_fdm_hessians(forwarddiff_backend2,testFDMbackend) end @testset "jvp" begin - test_fdm_jvp(fdm_backend1) - test_fdm_jvp(fdm_backend2) - test_fdm_jvp(fdm_backend3) + test_fdm_jvp(forwarddiff_backend1,testFDMbackend) + test_fdm_jvp(forwarddiff_backend2,testFDMbackend) end @testset "j′vp" begin - test_fdm_j′vp(fdm_backend1) - test_fdm_j′vp(fdm_backend2) - test_fdm_j′vp(fdm_backend3) + test_fdm_j′vp(forwarddiff_backend1,testFDMbackend) + test_fdm_j′vp(forwarddiff_backend2,testFDMbackend) end @testset "Lazy Derivative" begin - test_fdm_lazy_derivatives(fdm_backend1) - test_fdm_lazy_derivatives(fdm_backend2) - test_fdm_lazy_derivatives(fdm_backend3) + test_fdm_lazy_derivatives(forwarddiff_backend1,testFDMbackend) + test_fdm_lazy_derivatives(forwarddiff_backend2,testFDMbackend) end @testset "Lazy Gradient" begin - test_fdm_lazy_gradients(fdm_backend1) - test_fdm_lazy_gradients(fdm_backend2) - test_fdm_lazy_gradients(fdm_backend3) + test_fdm_lazy_gradients(forwarddiff_backend1,testFDMbackend) + test_fdm_lazy_gradients(forwarddiff_backend2,testFDMbackend) end @testset "Lazy Jacobian" begin - test_fdm_lazy_jacobians(fdm_backend1) - test_fdm_lazy_jacobians(fdm_backend2) - test_fdm_lazy_jacobians(fdm_backend3) + test_fdm_lazy_jacobians(forwarddiff_backend1,testFDMbackend) + test_fdm_lazy_jacobians(forwarddiff_backend2,testFDMbackend) end @testset "Lazy Hessian" begin - test_fdm_lazy_hessians(fdm_backend1) - test_fdm_lazy_hessians(fdm_backend2) - test_fdm_lazy_hessians(fdm_backend3) + #@test_broken test_fdm_lazy_hessians(forwarddiff_backend1,testFDMbackend) + #@test_broken test_fdm_lazy_hessians(forwarddiff_backend2,testFDMbackend) end end end From c0c489aeb89d2c35eac3e97f2a4e7f288f7e09a3 Mon Sep 17 00:00:00 2001 From: Frank Schaefer Date: Mon, 2 Aug 2021 21:36:53 +0200 Subject: [PATCH 19/25] ForwardDiff and test updates --- src/AbstractDifferentiation.jl | 39 ++- test/runtests.jl | 554 +++++++++++++++------------------ 2 files changed, 280 insertions(+), 313 deletions(-) diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl index 306fe65..643d60f 100644 --- a/src/AbstractDifferentiation.jl +++ b/src/AbstractDifferentiation.jl @@ -44,20 +44,13 @@ function gradient(ab::AbstractBackend, f, xs...) end function jacobian(ab::AbstractBackend, f, xs...) end function hessian(ab::AbstractBackend, f, xs...) - xss = collect((xs...,)) - counter = 0 - # gradient returns tuple of gradient values with respect to inputs x,y ∈ xs - # Hessian is the Jacobian of the individual gradients, i.e., the tuple of matrices - # defined by ∂x∂x f, ∂y∂y f, in the case of a scalar valued function `f`. - hess = map((xs...,)) do x - counter += 1 - # needs to be modified for forward mode AD; no method matching error from setindex! 
- _f = _x->f(setindex!(deepcopy(xss),_x,counter)...) - return jacobian(secondlowest(ab),(x,)-> begin - return gradient(lowest(ab), _f, x) - end, x)[1] - end - return hess + if xs isa Tuple + # only support computation of Hessian for functions with single input argument + @assert length(xs) == 1 + end + return jacobian(secondlowest(ab), (xs...,) -> begin + gradient(lowest(ab), f, xs...) + end, xs...) end function value_and_derivative(ab::AbstractBackend, f, xs::Number...) @@ -168,7 +161,10 @@ function value_and_pushforward_function( xs..., ) return (ds) -> begin - @assert ds isa Tuple && length(ds) == length(xs) + if !(ds isa Tuple) + ds = (ds,) + end + @assert length(ds) == length(xs) local value primalcalled = false if ab isa AbstractFiniteDifference @@ -183,6 +179,7 @@ function value_and_pushforward_function( end return vs end, xs...)(ds) + return value, pf end end @@ -268,10 +265,16 @@ function Base.:*(y, d::LazyDerivative) end function Base.:*(d::LazyDerivative, y::Union{Number,Tuple}) + if y isa Tuple && d.xs isa Tuple + @assert length(y) == length(d.xs) + end return derivative(d.backend, d.f, d.xs...) .* y end function Base.:*(y::Union{Number,Tuple}, d::LazyDerivative) + if y isa Tuple && d.xs isa Tuple + @assert length(y) == length(d.xs) + end return y .* derivative(d.backend, d.f, d.xs...) end @@ -293,6 +296,9 @@ Base.:*(d::LazyGradient, y) = gradient(d.backend, d.f, d.xs...) * y Base.:*(y, d::LazyGradient) = y * gradient(d.backend, d.f, d.xs...) function Base.:*(d::LazyGradient, y::Union{Number,Tuple}) + if y isa Tuple && d.xs isa Tuple + @assert length(y) == length(d.xs) + end if d.xs isa Tuple return gradient(d.backend, d.f, d.xs...) .* y else @@ -301,6 +307,9 @@ function Base.:*(d::LazyGradient, y::Union{Number,Tuple}) end function Base.:*(y::Union{Number,Tuple}, d::LazyGradient) + if y isa Tuple && d.xs isa Tuple + @assert length(y) == length(d.xs) + end if d.xs isa Tuple return y .* gradient(d.backend, d.f, d.xs...) else diff --git a/test/runtests.jl b/test/runtests.jl index 05e9df6..97f9687 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,7 +14,7 @@ FDMBackend1() = FDMBackend1(central_fdm(5, 1)) const fdm_backend1 = FDMBackend1() # Minimal interface AD.@primitive function jacobian(ab::FDMBackend1, f, xs...) - return jacobian(ab.alg, f, xs...) + return FDM.jacobian(ab.alg, f, xs...) end struct FDMBackend2{A} <: AD.AbstractFiniteDifference @@ -24,7 +24,7 @@ FDMBackend2() = FDMBackend2(central_fdm(5, 1)) const fdm_backend2 = FDMBackend2() AD.@primitive function pushforward_function(ab::FDMBackend2, f, xs...) return function (vs) - jvp(ab.alg, f, tuple.(xs, vs)...) + FDM.jvp(ab.alg, f, tuple.(xs, vs)...) end end @@ -37,10 +37,10 @@ AD.@primitive function pullback_function(ab::FDMBackend3, f, xs...) return function (vs) # Supports only single output if vs isa AbstractVector - return j′vp(ab.alg, f, vs, xs...) + return FDM.j′vp(ab.alg, f, vs, xs...) else @assert length(vs) == 1 - return j′vp(ab.alg, f, vs[1], xs...) + return FDM.j′vp(ab.alg, f, vs[1], xs...) end end @@ -51,51 +51,23 @@ end ## ForwardDiff struct ForwardDiffBackend1 <: AD.AbstractForwardMode end const forwarddiff_backend1 = ForwardDiffBackend1() -AD.@primitive function jacobian(ab::ForwardDiffBackend1, f, xs...) - if xs isa AbstractArray - return ForwardDiff.jacobian(f, xs) - elseif xs isa Number - return ForwardDiff.derivative(f, xs) - elseif xs isa Tuple - @assert length(xs) <= 2 - if eltype(xs) <: Number - # Can this be avoided for the derivative computation? 
It seems to defeat the purpose of AbstractDifferentiation. - # If eltype(xs) isa Number, ForwardDiff will error by saying that Jacobian expects an array. - # if we convert the number to an array in AbstractDifferentiation, we get a no method error due to log(x) and exp(x) - if length(xs) == 1 - (ForwardDiff.derivative(f, xs[1]),) - else - _f1 = x -> f(x,xs[2]) - _f2 = x -> f(xs[1], x) - (ForwardDiff.derivative(_f1, xs[1]),ForwardDiff.derivative(_f2, xs[2])) - end +AD.@primitive function jacobian(ab::ForwardDiffBackend1, f, xs) + if xs isa Number + return (ForwardDiff.derivative(f, xs),) + elseif xs isa AbstractArray + out = f(xs) + if out isa Number + return (adjoint(ForwardDiff.gradient(f, xs)),) else - if length(xs) == 1 - out = f(xs[1]) - if out isa Number - # Lazy Jacobian test pb fails otherwise with "dimensions must match: a has dims (Base.OneTo(1), Base.OneTo(5)), b has dims (Base.OneTo(5),), mismatch at 1" - (reshape(ForwardDiff.gradient(f, xs[1]),(1,length(xs[1]))),) - else - (ForwardDiff.jacobian(f, xs[1]),) - end - else - out = f(xs...) - _f1 = x -> f(x,xs[2]) - _f2 = x -> f(xs[1], x) - if out isa Number - # reshape for pullback tests (similar as above) - (reshape(ForwardDiff.gradient(_f1, xs[1]),(1,length(xs[1]))), - reshape(ForwardDiff.gradient(_f2, xs[2]), (1, length(xs[2])))) - else - (ForwardDiff.jacobian(_f1, xs[1]),ForwardDiff.jacobian(_f2, xs[2])) - end - - end - end + return (ForwardDiff.jacobian(f, xs),) + end + elseif xs isa Tuple + error(typeof(xs)) else error(typeof(xs)) end end +AD.primalvalue(::ForwardDiffBackend1, ::Any, f, xs) = ForwardDiff.value.(f(xs...)) struct ForwardDiffBackend2 <: AD.AbstractForwardMode end const forwarddiff_backend2 = ForwardDiffBackend2() @@ -114,6 +86,8 @@ AD.@primitive function pushforward_function(ab::ForwardDiffBackend2, f, xs...) 
end end end +AD.primalvalue(::ForwardDiffBackend2, ::Any, f, xs) = ForwardDiff.value.(f(xs...)) + ## @@ -128,7 +102,7 @@ dfgraddxdx(x, y) = prod(x)./(x*x') - Diagonal(diag(prod(x)./(x*x'))) dfgraddydy(x, y) = zeros(length(y),length(y)) function fjac(x, y) - x + Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U) * y + x + -3*y + [y[2:end];zero(y[end])]/2# Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U) * y end dfjacdx(x, y) = I(length(x)) dfjacdy(x, y) = Bidiagonal(-ones(length(y)) * 3, ones(length(y) - 1) / 2, :U) @@ -152,171 +126,177 @@ xvec2 = deepcopy(xvec) yvec2 = deepcopy(yvec) -function test_fdm_derivatives(backend, fdm_backend) - # fdm_backend for comparison with finite differences - der1 = AD.derivative(backend, fder, xscalar, yscalar) - der2 = ( - fdm_backend.alg(x -> fder(x, yscalar), xscalar), - fdm_backend.alg(y -> fder(xscalar, y), yscalar), - ) - @test minimum(isapprox.(der1, der2, rtol=1e-10)) - valscalar, der3 = AD.value_and_derivative(backend, fder, xscalar, yscalar) - @test valscalar == fder(xscalar, yscalar) - @test der3 .- der1 == (0, 0) +function test_derivatives(backend; multiple_inputs=true) + # test with respect to analytical solution der_exact = (dfderdx(xscalar,yscalar), dfderdy(xscalar,yscalar)) - @test minimum(isapprox.(der_exact, der1, rtol=1e-10)) + if multiple_inputs + der1 = AD.derivative(backend, fder, xscalar, yscalar) + @test minimum(isapprox.(der_exact, der1, rtol=1e-10)) + valscalar, der2 = AD.value_and_derivative(backend, fder, xscalar, yscalar) + @test valscalar == fder(xscalar, yscalar) + @test der2 .- der1 == (0, 0) + end # test if single input (no tuple works) valscalara, dera = AD.value_and_derivative(backend, x -> fder(x, yscalar), xscalar) valscalarb, derb = AD.value_and_derivative(backend, y -> fder(xscalar, y), yscalar) - @test valscalar == valscalara - @test valscalar == valscalarb - @test isapprox(dera[1], der1[1], rtol=1e-10) - @test isapprox(derb[1], der1[2], rtol=1e-10) + @test fder(xscalar, yscalar) == valscalara + @test fder(xscalar, yscalar) == valscalarb + @test isapprox(dera[1], der_exact[1], rtol=1e-10) + @test isapprox(derb[1], der_exact[2], rtol=1e-10) end -function test_fdm_gradients(backend, fdm_backend) - grad1 = AD.gradient(backend, fgrad, xvec, yvec) - grad2 = FDM.grad(fdm_backend.alg, fgrad, xvec, yvec) - @test minimum(isapprox.(grad1, grad2, rtol=1e-10)) - valscalar, grad3 = AD.value_and_gradient(backend, fgrad, xvec, yvec) - @test valscalar == fgrad(xvec, yvec) - @test norm.(grad3 .- grad1) == (0, 0) +function test_gradients(backend; multiple_inputs=true) + # test with respect to analytical solution grad_exact = (dfgraddx(xvec,yvec), dfgraddy(xvec,yvec)) - @test minimum(isapprox.(grad_exact, grad1, rtol=1e-10)) - @test xvec == xvec2 - @test yvec == yvec2 + if multiple_inputs + grad1 = AD.gradient(backend, fgrad, xvec, yvec) + @test minimum(isapprox.(grad_exact, grad1, rtol=1e-10)) + valscalar, grad2 = AD.value_and_gradient(backend, fgrad, xvec, yvec) + @test valscalar == fgrad(xvec, yvec) + @test norm.(grad2 .- grad1) == (0, 0) + end # test if single input (no tuple works) valscalara, grada = AD.value_and_gradient(backend, x -> fgrad(x, yvec), xvec) valscalarb, gradb = AD.value_and_gradient(backend, y -> fgrad(xvec, y), yvec) - @test valscalar == valscalara - @test valscalar == valscalarb - @test isapprox(grada[1], grad1[1], rtol=1e-10) - @test isapprox(gradb[1], grad1[2], rtol=1e-10) -end - -function test_fdm_jacobians(backend,fdm_backend) - jac1 = AD.jacobian(backend, fjac, xvec, yvec) - jac2 = 
FDM.jacobian(fdm_backend.alg, fjac, xvec, yvec) - @test minimum(isapprox.(jac1, jac2, rtol=1e-10)) - valvec, jac3 = AD.value_and_jacobian(backend, fjac, xvec, yvec) - @test valvec == fjac(xvec, yvec) - @test norm.(jac3 .- jac1) == (0, 0) - grad_exact = (dfjacdx(xvec, yvec), dfjacdy(xvec, yvec)) - @test minimum(isapprox.(grad_exact, jac1, rtol=1e-10)) + @test fgrad(xvec, yvec) == valscalara + @test fgrad(xvec, yvec) == valscalarb + @test isapprox(grada[1], grad_exact[1], rtol=1e-10) + @test isapprox(gradb[1], grad_exact[2], rtol=1e-10) @test xvec == xvec2 @test yvec == yvec2 +end + +function test_jacobians(backend; multiple_inputs=true) + # test with respect to analytical solution + jac_exact = (dfjacdx(xvec, yvec), dfjacdy(xvec, yvec)) + if multiple_inputs + jac1 = AD.jacobian(backend, fjac, xvec, yvec) + @test minimum(isapprox.(jac_exact, jac1, rtol=1e-10)) + valvec, jac2 = AD.value_and_jacobian(backend, fjac, xvec, yvec) + @test valvec == fjac(xvec, yvec) + @test norm.(jac2 .- jac1) == (0, 0) + end + # test if single input (no tuple works) valveca, jaca = AD.value_and_jacobian(backend, x -> fjac(x, yvec), xvec) valvecb, jacb = AD.value_and_jacobian(backend, y -> fjac(xvec, y), yvec) - @test valvec == valveca - @test valvec == valvecb - @test isapprox(jaca[1], jac1[1], rtol=1e-10) - @test isapprox(jacb[1], jac1[2], rtol=1e-10) + @test fjac(xvec, yvec) == valveca + @test fjac(xvec, yvec) == valvecb + @test isapprox(jaca[1], jac_exact[1], rtol=1e-10) + @test isapprox(jacb[1], jac_exact[2], rtol=1e-10) + @test xvec == xvec2 + @test yvec == yvec2 end -function test_fdm_hessians(backend, fdm_backend) - H1 = AD.hessian(backend, fgrad, xvec, yvec) - @test dfgraddxdx(xvec,yvec) ≈ H1[1] atol=1e-10 - @test dfgraddydy(xvec,yvec) ≈ H1[2] atol=1e-10 +function test_hessians(backend; multiple_inputs=false) + if multiple_inputs + # ... 
but + error("multiple_inputs=true is not supported.") + else + # explicit test that AbstractDifferentiation throws an error + # don't support tuple of Hessians + @test_throws AssertionError H1 = AD.hessian(backend, fgrad, xvec, yvec) + end + + # @test dfgraddxdx(xvec,yvec) ≈ H1[1] atol=1e-10 + # @test dfgraddydy(xvec,yvec) ≈ H1[2] atol=1e-10 # test if single input (no tuple works) fhess = x -> fgrad(x, yvec) hess1 = AD.hessian(backend, fhess, xvec) - hess2 = FDM.jacobian( - fdm_backend.alg, - (x) -> begin - FDM.grad( - fdm_backend.alg, - fhess, - x, - ) - end, - xvec, - ) - @test minimum(isapprox.(hess1, hess2, rtol=1e-10)) - valscalar, hess3 = AD.value_and_hessian(backend, fhess, xvec) + # test with respect to analytical solution + @test dfgraddxdx(xvec,yvec) ≈ hess1[1] atol=1e-10 + + valscalar, hess2 = AD.value_and_hessian(backend, fhess, xvec) @test valscalar == fgrad(xvec, yvec) - @test norm.(hess3 .- hess1) == (0,) - valscalar, grad, hess4 = AD.value_gradient_and_hessian(backend, fhess, xvec) + @test norm.(hess2 .- hess1) == (0,) + valscalar, grad, hess3 = AD.value_gradient_and_hessian(backend, fhess, xvec) @test valscalar == fgrad(xvec, yvec) @test norm.(grad .- AD.gradient(backend, fhess, xvec)) == (0,) - @test norm.(hess4 .- hess1) == (0,) - @test dfgraddxdx(xvec,yvec) ≈ hess1[1] atol=1e-10 + @test norm.(hess3 .- hess1) == (0,) + @test xvec == xvec2 @test yvec == yvec2 fhess2 = x-> dfgraddx(x, yvec) - hess5 = AD.jacobian(backend, fhess2, xvec) - @test minimum(isapprox.(hess5, hess1, atol=1e-10)) + hess4 = AD.jacobian(backend, fhess2, xvec) + @test minimum(isapprox.(hess4, hess1, atol=1e-10)) end -function test_fdm_jvp(backend,fdm_backend) +function test_jvp(backend; multiple_inputs=true) v = (rand(length(xvec)), rand(length(yvec))) - if backend isa Union{FDMBackend2,ForwardDiffBackend2} # augmented version of v - identity_like = AD.identity_matrix_like(v) - vaug = map(identity_like) do identity_like_i - identity_like_i .* v - end + if multiple_inputs + if backend isa Union{FDMBackend2,ForwardDiffBackend2} # augmented version of v + identity_like = AD.identity_matrix_like(v) + vaug = map(identity_like) do identity_like_i + identity_like_i .* v + end - pf1 = map(v->AD.pushforward_function(backend, fjac, xvec, yvec)(v), vaug) - ((valvec1, pf3x), (valvec2, pf3y)) = map(v->AD.value_and_pushforward_function(backend, fjac, xvec, yvec)(v), vaug) - else - pf1 = AD.pushforward_function(backend, fjac, xvec, yvec)(v) - valvec, pf3 = AD.value_and_pushforward_function(backend, fjac, xvec, yvec)(v) - ((valvec1, pf3x), (valvec2, pf3y)) = (valvec, pf3[1]), (valvec, pf3[2]) + pf1 = map(v->AD.pushforward_function(backend, fjac, xvec, yvec)(v), vaug) + ((valvec1, pf2x), (valvec2, pf2y)) = map(v->AD.value_and_pushforward_function(backend, fjac, xvec, yvec)(v), vaug) + else + pf1 = AD.pushforward_function(backend, fjac, xvec, yvec)(v) + valvec, pf2 = AD.value_and_pushforward_function(backend, fjac, xvec, yvec)(v) + ((valvec1, pf2x), (valvec2, pf2y)) = (valvec, pf2[1]), (valvec, pf2[2]) + end + + @test valvec1 == fjac(xvec, yvec) + @test valvec2 == fjac(xvec, yvec) + @test norm.((pf2x,pf2y) .- pf1) == (0, 0) + # test with respect to analytical solution + @test minimum(isapprox.(pf1, (jxvp(xvec,yvec,v[1]), jyvp(xvec,yvec,v[2])), atol=1e-10)) + @test xvec == xvec2 + @test yvec == yvec2 end - pf2 = ( - FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])), - FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])), - ) - @test minimum(isapprox.(pf1, pf2, rtol=1e-10)) + valvec1, pf1 = 
AD.value_and_pushforward_function(backend, x -> fjac(x, yvec), xvec)(v[1]) + valvec2, pf2 = AD.value_and_pushforward_function(backend, y -> fjac(xvec, y), yvec)(v[2]) + + if backend isa Union{FDMBackend2} + pf1 = (pf1,) + pf2 = (pf2,) + end @test valvec1 == fjac(xvec, yvec) @test valvec2 == fjac(xvec, yvec) - @test norm.((pf3x,pf3y) .- pf1) == (0, 0) - @test minimum(isapprox.(pf1, (jxvp(xvec,yvec,v[1]), jyvp(xvec,yvec,v[2])), atol=1e-10)) - @test xvec == xvec2 - @test yvec == yvec2 + @test minimum(isapprox.((pf1[1],pf2[1]), (jxvp(xvec,yvec,v[1]), jyvp(xvec,yvec,v[2])), atol=1e-10)) end -function test_fdm_j′vp(backend,fdm_backend) +function test_j′vp(backend; multiple_inputs=true) + # test with respect to analytical solution w = rand(length(fjac(xvec, yvec))) - pb1 = AD.pullback_function(backend, fjac, xvec, yvec)(w) - pb2 = FDM.j′vp(fdm_backend.alg, fjac, w, xvec, yvec) - @test all(norm.(pb1 .- pb2) .<= (1e-10, 1e-10)) - valvec, pb3 = AD.value_and_pullback_function(backend, fjac, xvec, yvec)(w) - @test valvec == fjac(xvec, yvec) - @test norm.(pb3 .- pb1) == (0, 0) - @test minimum(isapprox.(pb1, (vJxp(xvec,yvec,w), vJyp(xvec,yvec,w)), atol=1e-10)) - @test xvec == xvec2 - @test yvec == yvec2 + if multiple_inputs + pb1 = AD.pullback_function(backend, fjac, xvec, yvec)(w) + valvec, pb2 = AD.value_and_pullback_function(backend, fjac, xvec, yvec)(w) + @test valvec == fjac(xvec, yvec) + @test norm.(pb2 .- pb1) == (0, 0) + @test minimum(isapprox.(pb1, (vJxp(xvec,yvec,w), vJyp(xvec,yvec,w)), atol=1e-10)) + @test xvec == xvec2 + @test yvec == yvec2 + end + + valvec1, pb1 = AD.value_and_pullback_function(backend, x -> fjac(x, yvec), xvec)(w) + valvec2, pb2 = AD.value_and_pullback_function(backend, y -> fjac(xvec, y), yvec)(w) + @test valvec1 == fjac(xvec, yvec) + @test valvec2 == fjac(xvec, yvec) + @test minimum(isapprox.((pb1[1],pb2[1]), (vJxp(xvec,yvec,w), vJyp(xvec,yvec,w)), atol=1e-10)) end -function test_fdm_lazy_derivatives(backend,fdm_backend) +function test_lazy_derivatives(backend; multiple_inputs=true) # single input function der1 = AD.derivative(backend, x->fder(x, yscalar), xscalar) - der2 = ( - fdm_backend.alg(x -> fder(x, yscalar), xscalar), - fdm_backend.alg(y -> fder(xscalar, y), yscalar), - ) - lazyder = AD.LazyDerivative(backend, x->fder(x, yscalar), xscalar) # multiplication with scalar - @test isapprox(der1[1]*yscalar, der2[1]*yscalar, atol=1e-10) @test lazyder*yscalar == der1.*yscalar @test lazyder*yscalar isa Tuple - @test isapprox(yscalar*der1[1], yscalar*der2[1], atol=1e-10) @test yscalar*lazyder == yscalar.*der1 @test yscalar*lazyder isa Tuple # multiplication with array - @test isapprox(der1[1]*yvec, der2[1]*yvec, atol=1e-10) @test lazyder*yvec == (der1.*yvec,) @test lazyder*yvec isa Tuple - @test isapprox(yvec*der1[1], yvec*der2[1], atol=1e-10) @test yvec*lazyder == (yvec.*der1,) @test yvec*lazyder isa Tuple @@ -328,52 +308,42 @@ function test_fdm_lazy_derivatives(backend,fdm_backend) @test (yvec,)*lazyder == yvec*lazyder # two input function - der1 = AD.derivative(backend, fder, xscalar, yscalar) - der2 = ( - fdm_backend.alg(x -> fder(x, yscalar), xscalar), - fdm_backend.alg(y -> fder(xscalar, y), yscalar), - ) + if multiple_inputs + der1 = AD.derivative(backend, fder, xscalar, yscalar) + lazyder = AD.LazyDerivative(backend, fder, (xscalar, yscalar)) - lazyder = AD.LazyDerivative(backend, fder, (xscalar, yscalar)) + # multiplication with scalar + @test lazyder*yscalar == der1.*yscalar + @test lazyder*yscalar isa Tuple - # multiplication with scalar - @test 
minimum(isapprox.(der1.*yscalar, der2.*yscalar, atol=1e-10)) - @test lazyder*yscalar == der1.*yscalar - @test lazyder*yscalar isa Tuple + @test yscalar*lazyder == yscalar.*der1 + @test yscalar*lazyder isa Tuple - @test minimum(isapprox.(yscalar.*der1, yscalar.*der2, atol=1e-10)) - @test yscalar*lazyder == yscalar.*der1 - @test yscalar*lazyder isa Tuple + # multiplication with array + @test lazyder*yvec == (der1[1]*yvec, der1[2]*yvec) + @test lazyder*yvec isa Tuple - # multiplication with array - @test minimum(isapprox.((der1[1]*yvec, der1[2]*yvec), (der2[1]*yvec, der2[2]*yvec), atol=1e-10)) - @test lazyder*yvec == (der1[1]*yvec, der1[2]*yvec) - @test lazyder*yvec isa Tuple + @test yvec*lazyder == (yvec*der1[1], yvec*der1[2]) + @test lazyder*yvec isa Tuple - @test minimum(isapprox.((yvec*der1[1], yvec*der1[2]), (yvec*der2[1], yvec*der2[2]), atol=1e-10)) - @test yvec*lazyder == (yvec*der1[1], yvec*der1[2]) - @test lazyder*yvec isa Tuple + # multiplication with tuple + @test_throws AssertionError lazyder*(yscalar,) + @test_throws AssertionError lazyder*(yvec,) - # multiplication with tuple - @test lazyder*(yscalar,) == lazyder*yscalar - @test lazyder*(yvec,) == lazyder*yvec - - @test (yscalar,)*lazyder == yscalar*lazyder - @test (yvec,)*lazyder == yvec*lazyder + @test_throws AssertionError (yscalar,)*lazyder + @test_throws AssertionError (yvec,)*lazyder + end end -function test_fdm_lazy_gradients(backend,fdm_backend) +function test_lazy_gradients(backend; multiple_inputs=true) # single input function grad1 = AD.gradient(backend, x->fgrad(x, yvec), xvec) - grad2 = FDM.grad(fdm_backend.alg, x->fgrad(x, yvec), xvec) lazygrad = AD.LazyGradient(backend, x->fgrad(x, yvec), xvec) # multiplication with scalar - @test minimum(isapprox.(grad1.*yscalar, grad2.*yscalar, atol=1e-10)) @test norm.(lazygrad*yscalar .- grad1.*yscalar) == (0,) @test lazygrad*yscalar isa Tuple - @test minimum(isapprox.(yscalar.*grad1, yscalar.*grad2, atol=1e-10)) @test norm.(yscalar*lazygrad .- yscalar.*grad1) == (0,) @test yscalar*lazygrad isa Tuple @@ -382,96 +352,87 @@ function test_fdm_lazy_gradients(backend,fdm_backend) @test (yscalar,)*lazygrad == yscalar*lazygrad # two input function - grad1 = AD.gradient(backend, fgrad, xvec, yvec) - grad2 = FDM.grad(fdm_backend.alg, fgrad, xvec, yvec) - lazygrad = AD.LazyGradient(backend, fgrad, (xvec, yvec)) + if multiple_inputs + grad1 = AD.gradient(backend, fgrad, xvec, yvec) + lazygrad = AD.LazyGradient(backend, fgrad, (xvec, yvec)) - # multiplication with scalar - @test minimum(isapprox.(grad1.*yscalar, grad2.*yscalar, atol=1e-10)) - @test norm.(lazygrad*yscalar .- grad1.*yscalar) == (0,0) - @test lazygrad*yscalar isa Tuple + # multiplication with scalar + @test norm.(lazygrad*yscalar .- grad1.*yscalar) == (0,0) + @test lazygrad*yscalar isa Tuple - @test minimum(isapprox.(yscalar.*grad1, yscalar.*grad2, atol=1e-10)) - @test norm.(yscalar*lazygrad .- yscalar.*grad1) == (0,0) - @test yscalar*lazygrad isa Tuple + @test norm.(yscalar*lazygrad .- yscalar.*grad1) == (0,0) + @test yscalar*lazygrad isa Tuple - # multiplication with tuple - @test lazygrad*(yscalar,) == lazygrad*yscalar - @test (yscalar,)*lazygrad == yscalar*lazygrad + # multiplication with tuple + @test_throws AssertionError lazygrad*(yscalar,) == lazygrad*yscalar + @test_throws AssertionError (yscalar,)*lazygrad == yscalar*lazygrad + end end -function test_fdm_lazy_jacobians(backend,fdm_backend) +function test_lazy_jacobians(backend; multiple_inputs=true) # single input function jac1 = AD.jacobian(backend, x->fjac(x, 
yvec), xvec) - jac2 = FDM.jacobian(fdm_backend.alg, x->fjac(x, yvec), xvec) lazyjac = AD.LazyJacobian(backend, x->fjac(x, yvec), xvec) # multiplication with scalar - @test minimum(isapprox.(jac1.*yscalar, jac2.*yscalar, atol=1e-10)) @test norm.(lazyjac*yscalar .- jac1.*yscalar) == (0,) @test lazyjac*yscalar isa Tuple - @test minimum(isapprox.(yscalar.*jac1, yscalar.*jac2, atol=1e-10)) @test norm.(yscalar*lazyjac .- yscalar.*jac1) == (0,) @test yscalar*lazyjac isa Tuple - w = adjoint(rand(length(fjac(xvec, yvec)))) + w = rand(length(fjac(xvec, yvec))) v = (rand(length(xvec)),rand(length(xvec))) # vjp - pb1 = FDM.j′vp(fdm_backend.alg, x -> fjac(x, yvec), w, xvec) - res = w*lazyjac + pb1 = (vJxp(xvec,yvec,w),) + res = w'*lazyjac @test minimum(isapprox.(pb1, res, atol=1e-10)) @test res isa Tuple # jvp - pf1 = (FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])),) + pf1 = (jxvp(xvec,yvec,v[1]),) res = lazyjac*v[1] @test minimum(isapprox.(pf1, res, atol=1e-10)) @test res isa Tuple # two input function - jac1 = AD.jacobian(backend, fjac, xvec, yvec) - jac2 = FDM.jacobian(fdm_backend.alg, fjac, xvec, yvec) - lazyjac = AD.LazyJacobian(backend, fjac, (xvec, yvec)) - - # multiplication with scalar - @test minimum(isapprox.(jac1.*yscalar, jac2.*yscalar, atol=1e-10)) - @test norm.(lazyjac*yscalar .- jac1.*yscalar) == (0,0) - @test lazyjac*yscalar isa Tuple - - @test minimum(isapprox.(yscalar.*jac1, yscalar.*jac2, atol=1e-10)) - @test norm.(yscalar*lazyjac .- yscalar.*jac1) == (0,0) - @test yscalar*lazyjac isa Tuple - - # vjp - pb1 = FDM.j′vp(fdm_backend.alg, fjac, w, xvec, yvec) - res = w*lazyjac - @test minimum(isapprox.(pb1, res, atol=1e-10)) - @test res isa Tuple + if multiple_inputs + jac1 = AD.jacobian(backend, fjac, xvec, yvec) + lazyjac = AD.LazyJacobian(backend, fjac, (xvec, yvec)) + + # multiplication with scalar + @test norm.(lazyjac*yscalar .- jac1.*yscalar) == (0,0) + @test lazyjac*yscalar isa Tuple + + @test norm.(yscalar*lazyjac .- yscalar.*jac1) == (0,0) + @test yscalar*lazyjac isa Tuple + + # vjp + pb1 = (vJxp(xvec,yvec,w), vJyp(xvec,yvec,w)) + res = w'lazyjac + @test minimum(isapprox.(pb1, res, atol=1e-10)) + @test res isa Tuple + + # jvp + pf1 = (jxvp(xvec,yvec,v[1]), jyvp(xvec,yvec,v[2])) + + if backend isa Union{FDMBackend2,ForwardDiffBackend2} # augmented version of v + identity_like = AD.identity_matrix_like(v) + vaug = map(identity_like) do identity_like_i + identity_like_i .* v + end - # jvp - pf1 = ( - FDM.jvp(fdm_backend.alg, x -> fjac(x, yvec), (xvec, v[1])), - FDM.jvp(fdm_backend.alg, y -> fjac(xvec, y), (yvec, v[2])), - ) - - if backend isa Union{FDMBackend2,ForwardDiffBackend2} # augmented version of v - identity_like = AD.identity_matrix_like(v) - vaug = map(identity_like) do identity_like_i - identity_like_i .* v + res = map(v->(lazyjac*v)[1], vaug) + else + res = lazyjac*v end - - res = map(v->(lazyjac*v)[1], vaug) - else - res = lazyjac*v + @test minimum(isapprox.(pf1, res, atol=1e-10)) + @test res isa Tuple end - - @test minimum(isapprox.(pf1, res, atol=1e-10)) - @test res isa Tuple end -function test_fdm_lazy_hessians(backend,fdm_backend) +function test_lazy_hessians(backend; multiple_inputs=true) # fdm_backend not used here yet.. 
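+ # A sketch of the two contractions exercised below, for the single-input
+ # fhess defined next (hess1 denotes the Hessian tuple from AD.hessian):
+ #   Hvp:  lazyhess*v   should match the dense Hessian-vector products H*v
+ #   H′vp: w'*lazyhess  should match map(h -> h'*w, hess1)
+ # For twice-differentiable f the two coincide, the Hessian being symmetric.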
# single input function fhess = x -> fgrad(x, yvec) @@ -486,7 +447,7 @@ function test_fdm_lazy_hessians(backend,fdm_backend) @test minimum(isapprox.(yscalar*lazyhess, yscalar.*hess1, atol=1e-10)) @test yscalar*lazyhess isa Tuple - w = adjoint(rand(length(xvec))) + w = rand(length(xvec)) v = rand(length(xvec)) # Hvp @@ -496,109 +457,106 @@ function test_fdm_lazy_hessians(backend,fdm_backend) @test res isa Tuple # H′vp - wH = map(h->h'*adjoint(w), hess1) - res = w*lazyhess + wH = map(h->h'*w, hess1) + res = w'*lazyhess @test minimum(isapprox.(wH, res, atol=1e-10)) @test res isa Tuple end @testset "AbstractDifferentiation.jl" begin - testFDMbackend = fdm_backend1 @testset "FiniteDifferences" begin @testset "Derivative" begin - test_fdm_derivatives(fdm_backend1,testFDMbackend) - test_fdm_derivatives(fdm_backend2,testFDMbackend) - test_fdm_derivatives(fdm_backend3,testFDMbackend) + test_derivatives(fdm_backend1) + test_derivatives(fdm_backend2) + test_derivatives(fdm_backend3) end @testset "Gradient" begin - test_fdm_gradients(fdm_backend1,testFDMbackend) - test_fdm_gradients(fdm_backend2,testFDMbackend) - test_fdm_gradients(fdm_backend3,testFDMbackend - ) + test_gradients(fdm_backend1) + test_gradients(fdm_backend2) + test_gradients(fdm_backend3) end @testset "Jacobian" begin - test_fdm_jacobians(fdm_backend1,testFDMbackend) - test_fdm_jacobians(fdm_backend2,testFDMbackend) - test_fdm_jacobians(fdm_backend3,testFDMbackend) + test_jacobians(fdm_backend1) + test_jacobians(fdm_backend2) + test_jacobians(fdm_backend3) end @testset "Hessian" begin - # Works but super slow - test_fdm_hessians(fdm_backend1,testFDMbackend) - test_fdm_hessians(fdm_backend2,testFDMbackend) - test_fdm_hessians(fdm_backend3,testFDMbackend) + test_hessians(fdm_backend1) + test_hessians(fdm_backend2) + test_hessians(fdm_backend3) end @testset "jvp" begin - test_fdm_jvp(fdm_backend1,testFDMbackend) - test_fdm_jvp(fdm_backend2,testFDMbackend) - test_fdm_jvp(fdm_backend3,testFDMbackend) + test_jvp(fdm_backend1) + test_jvp(fdm_backend2) + test_jvp(fdm_backend3) end @testset "j′vp" begin - test_fdm_j′vp(fdm_backend1,testFDMbackend) - test_fdm_j′vp(fdm_backend2,testFDMbackend) - test_fdm_j′vp(fdm_backend3,testFDMbackend) + test_j′vp(fdm_backend1) + test_j′vp(fdm_backend2) + test_j′vp(fdm_backend3) end @testset "Lazy Derivative" begin - test_fdm_lazy_derivatives(fdm_backend1,testFDMbackend) - test_fdm_lazy_derivatives(fdm_backend2,testFDMbackend) - test_fdm_lazy_derivatives(fdm_backend3,testFDMbackend) + test_lazy_derivatives(fdm_backend1) + test_lazy_derivatives(fdm_backend2) + test_lazy_derivatives(fdm_backend3) end @testset "Lazy Gradient" begin - test_fdm_lazy_gradients(fdm_backend1,testFDMbackend) - test_fdm_lazy_gradients(fdm_backend2,testFDMbackend) - test_fdm_lazy_gradients(fdm_backend3,testFDMbackend) + test_lazy_gradients(fdm_backend1) + test_lazy_gradients(fdm_backend2) + test_lazy_gradients(fdm_backend3) end @testset "Lazy Jacobian" begin - test_fdm_lazy_jacobians(fdm_backend1,testFDMbackend) - test_fdm_lazy_jacobians(fdm_backend2,testFDMbackend) - test_fdm_lazy_jacobians(fdm_backend3,testFDMbackend) + test_lazy_jacobians(fdm_backend1) + test_lazy_jacobians(fdm_backend2) + test_lazy_jacobians(fdm_backend3) end @testset "Lazy Hessian" begin - test_fdm_lazy_hessians(fdm_backend1,testFDMbackend) - test_fdm_lazy_hessians(fdm_backend2,testFDMbackend) - test_fdm_lazy_hessians(fdm_backend3,testFDMbackend) + test_lazy_hessians(fdm_backend1) + test_lazy_hessians(fdm_backend2) + test_lazy_hessians(fdm_backend3) end end @testset 
"ForwardDiff" begin @testset "Derivative" begin - test_fdm_derivatives(forwarddiff_backend1,testFDMbackend) - test_fdm_derivatives(forwarddiff_backend2,testFDMbackend) + test_derivatives(forwarddiff_backend1; multiple_inputs=false) + test_derivatives(forwarddiff_backend2) end @testset "Gradient" begin - test_fdm_gradients(forwarddiff_backend1,testFDMbackend) - test_fdm_gradients(forwarddiff_backend2,testFDMbackend) + test_gradients(forwarddiff_backend1; multiple_inputs=false) + test_gradients(forwarddiff_backend2) end @testset "Jacobian" begin - test_fdm_jacobians(forwarddiff_backend1,testFDMbackend) - test_fdm_jacobians(forwarddiff_backend2,testFDMbackend) + test_jacobians(forwarddiff_backend1; multiple_inputs=false) + test_jacobians(forwarddiff_backend2) end @testset "Hessian" begin - # Fails due to setindex! in AbstractDifferentiation Hessian function - @test_broken test_fdm_hessians(forwarddiff_backend1,testFDMbackend) - @test_broken test_fdm_hessians(forwarddiff_backend2,testFDMbackend) + #test_hessians(forwarddiff_backend1; multiple_inputs=false) + #test_hessians(forwarddiff_backend2) end @testset "jvp" begin - test_fdm_jvp(forwarddiff_backend1,testFDMbackend) - test_fdm_jvp(forwarddiff_backend2,testFDMbackend) + test_jvp(forwarddiff_backend1; multiple_inputs=false) + test_jvp(forwarddiff_backend2) end @testset "j′vp" begin - test_fdm_j′vp(forwarddiff_backend1,testFDMbackend) - test_fdm_j′vp(forwarddiff_backend2,testFDMbackend) + test_j′vp(forwarddiff_backend1; multiple_inputs=false) + test_j′vp(forwarddiff_backend2) end @testset "Lazy Derivative" begin - test_fdm_lazy_derivatives(forwarddiff_backend1,testFDMbackend) - test_fdm_lazy_derivatives(forwarddiff_backend2,testFDMbackend) + test_lazy_derivatives(forwarddiff_backend1; multiple_inputs=false) + test_lazy_derivatives(forwarddiff_backend2) end @testset "Lazy Gradient" begin - test_fdm_lazy_gradients(forwarddiff_backend1,testFDMbackend) - test_fdm_lazy_gradients(forwarddiff_backend2,testFDMbackend) + test_lazy_gradients(forwarddiff_backend1; multiple_inputs=false) + test_lazy_gradients(forwarddiff_backend2) end @testset "Lazy Jacobian" begin - test_fdm_lazy_jacobians(forwarddiff_backend1,testFDMbackend) - test_fdm_lazy_jacobians(forwarddiff_backend2,testFDMbackend) + test_lazy_jacobians(forwarddiff_backend1; multiple_inputs=false) + test_lazy_jacobians(forwarddiff_backend2) end @testset "Lazy Hessian" begin - #@test_broken test_fdm_lazy_hessians(forwarddiff_backend1,testFDMbackend) - #@test_broken test_fdm_lazy_hessians(forwarddiff_backend2,testFDMbackend) + #test_lazy_hessians(forwarddiff_backend1; multiple_inputs=false) + #test_lazy_hessians(forwarddiff_backend2) end end end + From 0135125db9214bb9bd2ab47897a496c2fd115e45 Mon Sep 17 00:00:00 2001 From: Frank Schaefer Date: Tue, 3 Aug 2021 10:53:03 +0200 Subject: [PATCH 20/25] add Zygote tests --- Project.toml | 3 ++- test/runtests.jl | 65 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/Project.toml b/Project.toml index 0ca1a6e..3033ef9 100644 --- a/Project.toml +++ b/Project.toml @@ -15,6 +15,7 @@ FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "FiniteDifferences", "ForwardDiff", "Random"] +test = ["Test", "FiniteDifferences", "ForwardDiff", "Random", "Zygote"] diff --git 
a/test/runtests.jl b/test/runtests.jl index 97f9687..d2ec67a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,7 @@ using AbstractDifferentiation using Test, FiniteDifferences, LinearAlgebra using ForwardDiff +using Zygote using Random Random.seed!(1234) @@ -87,9 +88,33 @@ AD.@primitive function pushforward_function(ab::ForwardDiffBackend2, f, xs...) end end AD.primalvalue(::ForwardDiffBackend2, ::Any, f, xs) = ForwardDiff.value.(f(xs...)) +## +## Zygote +struct ZygoteBackend1 <: AD.AbstractReverseMode end +const zygote_backend1 = ZygoteBackend1() +AD.@primitive function pullback_function(ab::ZygoteBackend1, f, xs...) + return function (vs) + # Supports only single output + _, back = Zygote.pullback(f, xs...) + if vs isa AbstractVector + back(vs) + else + @assert length(vs) == 1 + back(vs[1]) + end + end +end ## +struct ZygoteBackend <: AD.AbstractReverseMode end +AD.@primitive function pullback_function(ab::ZygoteBackend, f, xs...) + return function (vs) + # Supports only single output and tuple as input + _, back = Zygote.pullback(f, xs...) + back(vs[1]) + end +end fder(x, y) = exp(y) * x + y * log(x) dfderdx(x, y) = exp(y) + y * 1/x @@ -530,8 +555,8 @@ end test_jacobians(forwarddiff_backend2) end @testset "Hessian" begin - #test_hessians(forwarddiff_backend1; multiple_inputs=false) - #test_hessians(forwarddiff_backend2) + test_hessians(forwarddiff_backend1; multiple_inputs=false) + test_hessians(forwarddiff_backend2) end @testset "jvp" begin test_jvp(forwarddiff_backend1; multiple_inputs=false) @@ -554,8 +579,40 @@ end test_lazy_jacobians(forwarddiff_backend2) end @testset "Lazy Hessian" begin - #test_lazy_hessians(forwarddiff_backend1; multiple_inputs=false) - #test_lazy_hessians(forwarddiff_backend2) + test_lazy_hessians(forwarddiff_backend1; multiple_inputs=false) + test_lazy_hessians(forwarddiff_backend2) + end + end + @testset "Zygote" begin + @testset "Derivative" begin + test_derivatives(zygote_backend1) + end + @testset "Gradient" begin + test_gradients(zygote_backend1) + end + @testset "Jacobian" begin + test_jacobians(zygote_backend1) + end + @testset "Hessian" begin + test_hessians(zygote_backend1) + end + @testset "jvp" begin + test_jvp(zygote_backend1) + end + @testset "j′vp" begin + test_j′vp(zygote_backend1) + end + @testset "Lazy Derivative" begin + test_lazy_derivatives(zygote_backend1) + end + @testset "Lazy Gradient" begin + test_lazy_gradients(zygote_backend1) + end + @testset "Lazy Jacobian" begin + test_lazy_jacobians(zygote_backend1) + end + @testset "Lazy Hessian" begin + test_lazy_hessians(zygote_backend1) end end end From 28da613bd379e67ddfb1eb357dbe6a55adca6c0f Mon Sep 17 00:00:00 2001 From: Frank Schaefer Date: Wed, 4 Aug 2021 14:16:14 +0200 Subject: [PATCH 21/25] fix ForwardDiff --- src/AbstractDifferentiation.jl | 38 +++++++++++++++++++++------------- test/runtests.jl | 10 --------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl index 643d60f..a5673a9 100644 --- a/src/AbstractDifferentiation.jl +++ b/src/AbstractDifferentiation.jl @@ -47,10 +47,11 @@ function hessian(ab::AbstractBackend, f, xs...) if xs isa Tuple # only support computation of Hessian for functions with single input argument @assert length(xs) == 1 + xs = xs[1] end - return jacobian(secondlowest(ab), (xs...,) -> begin - gradient(lowest(ab), f, xs...) - end, xs...) 
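+ # Second order is obtained by composing two (possibly different) backends:
+ # the second-lowest backend differentiates the gradient computed by the
+ # lowest backend. `gradient` returns a tuple, hence the [1] below.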
+ return jacobian(secondlowest(ab), xs -> begin + gradient(lowest(ab), f, xs)[1] # gradient returns a tuple + end, xs) end function value_and_derivative(ab::AbstractBackend, f, xs::Number...) @@ -92,7 +93,7 @@ function value_and_hessian(ab::AbstractBackend, f, xs...) value = primalvalue(ab, v, f, xs) primalcalled = true end - return g + return g[1] # gradient returns a tuple end, xs...) return value, hess end @@ -105,7 +106,7 @@ function value_and_hessian(ab::HigherOrderBackend, f, xs...) value = primalvalue(ab, v, f, xs) primalcalled = true end - return g + return g[1] # gradient returns a tuple end, xs...) return value, hess end @@ -118,9 +119,9 @@ function value_gradient_and_hessian(ab::AbstractBackend, f, xs...) value = primalvalue(secondlowest(ab), v, f, xs) primalcalled = true end - return g + return g[1] # gradient returns a tuple end, xs...) - return value, grads, hess + return value, (grads,) , hess end function value_gradient_and_hessian(ab::HigherOrderBackend, f, xs...) local value @@ -131,9 +132,9 @@ function value_gradient_and_hessian(ab::HigherOrderBackend, f, xs...) value = primalvalue(secondlowest(ab), v, f, xs) primalcalled = true end - return g + return g[1] # gradient returns a tuple end, xs...) - return value, grads, hess + return value, (grads,) , hess end function pushforward_function( @@ -244,7 +245,11 @@ function value_and_pullback_function( value = primalvalue(lowest(ab), vs, f, xs) primalcalled = true end - return vs + if vs isa Tuple + return vs[1] + else + return vs + end end, xs...)(ws) return value, pb end @@ -382,13 +387,18 @@ function Base.:*(d::LazyHessian, ys) end if d.xs isa Tuple - return pushforward_function( + res = pushforward_function( secondlowest(d.backend), - (xs...,) -> gradient(lowest(d.backend), d.f, xs...), d.xs...,)(ys) + (xs...,) -> gradient(lowest(d.backend), d.f, xs...)[1], d.xs...,)(ys) # [1] because gradient returns a tuple else - return pushforward_function( + res = pushforward_function( secondlowest(d.backend), - (xs,) -> gradient(lowest(d.backend), d.f, xs),d.xs,)(ys) + (xs,) -> gradient(lowest(d.backend), d.f, xs)[1],d.xs,)(ys) # gradient returns a tuple + end + if res isa Tuple + return res + else + return (res,) end end diff --git a/test/runtests.jl b/test/runtests.jl index d2ec67a..8432230 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -107,15 +107,6 @@ AD.@primitive function pullback_function(ab::ZygoteBackend1, f, xs...) end ## -struct ZygoteBackend <: AD.AbstractReverseMode end -AD.@primitive function pullback_function(ab::ZygoteBackend, f, xs...) - return function (vs) - # Supports only single output and tuple as input - _, back = Zygote.pullback(f, xs...) 
- back(vs[1])
- end
-end
-
 fder(x, y) = exp(y) * x + y * log(x)
 dfderdx(x, y) = exp(y) + y * 1/x
 dfderdy(x, y) = exp(y) * x + log(x)
@@ -616,4 +607,3 @@ end
 end
 end
 end
-

From 9d0f27e65a920a5a922ddbfb6c3595ea27c4f9ef Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Wed, 4 Aug 2021 15:48:57 +0200
Subject: [PATCH 22/25] Use HigherOrderBackend for second-order tests

---
 src/AbstractDifferentiation.jl | 10 +++++++++-
 test/runtests.jl | 30 ++++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index a5673a9..1ad7fd5 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -16,7 +16,11 @@ struct HigherOrderBackend{B} <: AbstractBackend
 end
 reduceorder(b::AbstractBackend) = b
 function reduceorder(b::HigherOrderBackend)
- return HigherOrderBackend(reverse(Base.tail(reverse(b.backends))))
+ if length(b.backends) == 1
+ return lowest(b) # avoid an empty backend tuple (and the resulting error) when reducing a single-element HigherOrderBackend
+ else
+ return HigherOrderBackend(reverse(Base.tail(reverse(b.backends))))
+ end
 end
 lowest(b::AbstractBackend) = b
 lowest(b::HigherOrderBackend) = b.backends[end]
@@ -43,6 +47,10 @@ function gradient(ab::AbstractBackend, f, xs...)
 return reshape.(adjoint.(jacobian(lowest(ab), f, xs...)),size.(xs))
 end
 function jacobian(ab::AbstractBackend, f, xs...) end
+function jacobian(ab::HigherOrderBackend, f, xs...)
+ return jacobian(lowest(ab), f, xs...)
+end
+
 function hessian(ab::AbstractBackend, f, xs...)
 if xs isa Tuple
 # only support computation of Hessian for functions with single input argument
 @assert length(xs) == 1
diff --git a/test/runtests.jl b/test/runtests.jl
index 8432230..aba4c0e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -141,6 +141,17 @@ const yvec = rand(5)
 xvec2 = deepcopy(xvec)
 yvec2 = deepcopy(yvec)

+function test_higher_order_backend(backends...)
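+ # Sanity-check the backend-stack accessors. For backends = (b1, b2, b3):
+ # AD.lowest returns b3, AD.secondlowest returns b2, and each AD.reduceorder
+ # call drops the current lowest backend until a plain AbstractBackend is left.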
+ ADbackends = AD.HigherOrderBackend(backends) + @test backends[end] == AD.lowest(ADbackends) + @test backends[end-1] == AD.secondlowest(ADbackends) + + for i in length(backends):-1:1 + @test backends[i] == AD.lowest(ADbackends) + ADbackends = AD.reduceorder(ADbackends) + end + backends[1] == AD.reduceorder(ADbackends) +end function test_derivatives(backend; multiple_inputs=true) # test with respect to analytical solution @@ -480,6 +491,9 @@ function test_lazy_hessians(backend; multiple_inputs=true) end @testset "AbstractDifferentiation.jl" begin + @testset "Utils" begin + test_higher_order_backend(fdm_backend1, fdm_backend2, fdm_backend3, zygote_backend1, forwarddiff_backend2) + end @testset "FiniteDifferences" begin @testset "Derivative" begin test_derivatives(fdm_backend1) @@ -585,7 +599,11 @@ end test_jacobians(zygote_backend1) end @testset "Hessian" begin - test_hessians(zygote_backend1) + # Zygote over Zygote problems + backends = AD.HigherOrderBackend((forwarddiff_backend2,zygote_backend1)) + test_hessians(backends) + backends = AD.HigherOrderBackend((zygote_backend1,forwarddiff_backend1)) + test_hessians(backends) end @testset "jvp" begin test_jvp(zygote_backend1) @@ -603,7 +621,15 @@ end test_lazy_jacobians(zygote_backend1) end @testset "Lazy Hessian" begin - test_lazy_hessians(zygote_backend1) + # Zygote over Zygote problems + backends = AD.HigherOrderBackend((forwarddiff_backend2,zygote_backend1)) + test_lazy_hessians(backends) + backends = AD.HigherOrderBackend((zygote_backend1,forwarddiff_backend1)) + test_lazy_hessians(backends) end end end + + +backends = AD.HigherOrderBackend((zygote_backend1,forwarddiff_backend2)) +test_hessians(backends) \ No newline at end of file From eca9fda2975acae9976d2dee23537f6648da93d9 Mon Sep 17 00:00:00 2001 From: Frank Schaefer Date: Wed, 4 Aug 2021 15:52:14 +0200 Subject: [PATCH 23/25] comment failing test. --- test/runtests.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index aba4c0e..f4d8f20 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -604,6 +604,9 @@ end test_hessians(backends) backends = AD.HigherOrderBackend((zygote_backend1,forwarddiff_backend1)) test_hessians(backends) + # fails: + # backends = AD.HigherOrderBackend((zygote_backend1,forwarddiff_backend2)) + # test_hessians(backends) end @testset "jvp" begin test_jvp(zygote_backend1) @@ -631,5 +634,3 @@ end end -backends = AD.HigherOrderBackend((zygote_backend1,forwarddiff_backend2)) -test_hessians(backends) \ No newline at end of file From 96b1a263c4e0c2d9492dac20b89b0d145be5ae60 Mon Sep 17 00:00:00 2001 From: Frank Schaefer Date: Wed, 11 Aug 2021 11:06:39 +0200 Subject: [PATCH 24/25] xs ... -> x for Hessian input --- src/AbstractDifferentiation.jl | 81 +++++++++++++++++++++------------- test/runtests.jl | 3 +- 2 files changed, 53 insertions(+), 31 deletions(-) diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl index 1ad7fd5..fc1f2e3 100644 --- a/src/AbstractDifferentiation.jl +++ b/src/AbstractDifferentiation.jl @@ -51,15 +51,15 @@ function jacobian(ab::HigherOrderBackend, f, xs...) jacobian(lowest(ab), f, xs...) end -function hessian(ab::AbstractBackend, f, xs...) 
- if xs isa Tuple +function hessian(ab::AbstractBackend, f, x) + if x isa Tuple # only support computation of Hessian for functions with single input argument - @assert length(xs) == 1 - xs = xs[1] + @assert length(x) == 1 + x = x[1] end - return jacobian(secondlowest(ab), xs -> begin - gradient(lowest(ab), f, xs)[1] # gradient returns a tuple - end, xs) + return jacobian(secondlowest(ab), x -> begin + gradient(lowest(ab), f, x)[1] # gradient returns a tuple + end, x) end function value_and_derivative(ab::AbstractBackend, f, xs::Number...) @@ -88,61 +88,82 @@ function value_and_jacobian(ab::AbstractBackend, f, xs...) return value, jacs end -function value_and_hessian(ab::AbstractBackend, f, xs...) +function value_and_hessian(ab::AbstractBackend, f, x) + if x isa Tuple + # only support computation of Hessian for functions with single input argument + @assert length(x) == 1 + x = x[1] + end + local value primalcalled = false if ab isa AbstractFiniteDifference - value = primalvalue(ab, nothing, f, xs) + value = primalvalue(ab, nothing, f, (x,)) primalcalled = true end - hess = jacobian(secondlowest(ab), (_xs...,) -> begin - v, g = value_and_gradient(lowest(ab), f, _xs...) + hess = jacobian(secondlowest(ab), _x -> begin + v, g = value_and_gradient(lowest(ab), f, _x) if !primalcalled - value = primalvalue(ab, v, f, xs) + value = primalvalue(ab, v, f, (x,)) primalcalled = true end return g[1] # gradient returns a tuple - end, xs...) + end, x) return value, hess end -function value_and_hessian(ab::HigherOrderBackend, f, xs...) +function value_and_hessian(ab::HigherOrderBackend, f, x) + if x isa Tuple + # only support computation of Hessian for functions with single input argument + @assert length(x) == 1 + x = x[1] + end local value primalcalled = false - hess = jacobian(secondlowest(ab), (_xs...,) -> begin - v, g = value_and_gradient(lowest(ab), f, _xs...) + hess = jacobian(secondlowest(ab), (_x,) -> begin + v, g = value_and_gradient(lowest(ab), f, _x) if !primalcalled - value = primalvalue(ab, v, f, xs) + value = primalvalue(ab, v, f, (x,)) primalcalled = true end return g[1] # gradient returns a tuple - end, xs...) + end, x) return value, hess end -function value_gradient_and_hessian(ab::AbstractBackend, f, xs...) +function value_gradient_and_hessian(ab::AbstractBackend, f, x) + if x isa Tuple + # only support computation of Hessian for functions with single input argument + @assert length(x) == 1 + x = x[1] + end local value primalcalled = false - grads, hess = value_and_jacobian(secondlowest(ab), (_xs...,) -> begin - v, g = value_and_gradient(lowest(ab), f, _xs...) + grads, hess = value_and_jacobian(secondlowest(ab), _x -> begin + v, g = value_and_gradient(lowest(ab), f, _x) if !primalcalled - value = primalvalue(secondlowest(ab), v, f, xs) + value = primalvalue(secondlowest(ab), v, f, (x,)) primalcalled = true end return g[1] # gradient returns a tuple - end, xs...) - return value, (grads,) , hess + end, x) + return value, (grads,), hess end -function value_gradient_and_hessian(ab::HigherOrderBackend, f, xs...) +function value_gradient_and_hessian(ab::HigherOrderBackend, f, x) + if x isa Tuple + # only support computation of Hessian for functions with single input argument + @assert length(x) == 1 + x = x[1] + end local value primalcalled = false - grads, hess = value_and_jacobian(secondlowest(ab), (_xs...,) -> begin - v, g = value_and_gradient(lowest(ab), f, _xs...) 
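+ # Same single-argument convention as in `hessian` above: the inner closure
+ # now takes one array `_x` instead of a splatted `_xs...`, so the
+ # second-order backend differentiates a function of a single input.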
+ grads, hess = value_and_jacobian(secondlowest(ab), _x -> begin
+ v, g = value_and_gradient(lowest(ab), f, _x)
 if !primalcalled
- value = primalvalue(secondlowest(ab), v, f, xs)
+ value = primalvalue(secondlowest(ab), v, f, (x,))
 primalcalled = true
 end
 return g[1] # gradient returns a tuple
- end, xs...)
- return value, (grads,) , hess
+ end, x)
+ return value, (grads,), hess
 end

 function pushforward_function(
diff --git a/test/runtests.jl b/test/runtests.jl
index f4d8f20..1bf305a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -222,7 +222,8 @@ function test_hessians(backend; multiple_inputs=false)
 else
 # explicit test that AbstractDifferentiation throws an error
 # don't support tuple of Hessians
- @test_throws AssertionError H1 = AD.hessian(backend, fgrad, xvec, yvec)
+ @test_throws AssertionError H1 = AD.hessian(backend, fgrad, (xvec, yvec))
+ @test_throws MethodError H1 = AD.hessian(backend, fgrad, xvec, yvec)
 end

 # @test dfgraddxdx(xvec,yvec) ≈ H1[1] atol=1e-10

From c384a1b840e8fa9b02ca7bd9695b1782073154a4 Mon Sep 17 00:00:00 2001
From: Frank Schaefer
Date: Wed, 11 Aug 2021 11:37:30 +0200
Subject: [PATCH 25/25] Remove unnecessary tuple conversion

---
 src/AbstractDifferentiation.jl | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl
index fc1f2e3..96e9dc5 100644
--- a/src/AbstractDifferentiation.jl
+++ b/src/AbstractDifferentiation.jl
@@ -274,11 +274,7 @@ function value_and_pullback_function(
 value = primalvalue(lowest(ab), vs, f, xs)
 primalcalled = true
 end
- if vs isa Tuple
- return vs[1]
- else
- return vs
- end
+ return vs
 end, xs...)(ws)
 return value, pb
 end
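
After this series the exported API can be exercised end to end. The
following is a minimal usage sketch, not part of the patches themselves: it
assumes the test backends zygote_backend1 and forwarddiff_backend2 defined
in test/runtests.jl, and the functions and inputs are illustrative only.

    using AbstractDifferentiation, Zygote, ForwardDiff  # AD is exported by the package

    x = rand(5)

    # First order, reverse mode: primal value plus vector-Jacobian products.
    f(x) = x .* x                  # Jacobian is Diagonal(2x)
    w = rand(5)                    # cotangent with the same length as f(x)
    val, pb = AD.value_and_pullback_function(zygote_backend1, f, x)(w)
    # val == f(x) and pb[1] ≈ 2 .* x .* w

    # Second order: ForwardDiff Jacobian over Zygote gradient via HigherOrderBackend.
    g(x) = sum(abs2, x) / 2        # gradient is x, Hessian is the identity
    hob = AD.HigherOrderBackend((forwarddiff_backend2, zygote_backend1))
    H = AD.hessian(hob, g, x)      # a 1-tuple; H[1] ≈ the 5x5 identity matrix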