diff --git a/Project.toml b/Project.toml
index 854cc6c..cc91938 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,12 +10,14 @@ LazyStack = "1fad7336-0346-5a1a-a56f-a06ba010965b"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SnoopPrecompile = "66db9d55-30c0-4569-8b51-7e840670fc0c"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 TransmuteDims = "24ddb15e-299a-5cc3-8414-dbddc482d9ca"
 
 [compat]
 ChainRulesCore = "1.11"
-Compat = "3.46, 4.2"  # for stack
+Compat = "3.46, 4.2"
 LazyArrays = "0.21, 0.22"
 LazyStack = "0.1.0"
+SnoopPrecompile = "1"  # provides @precompile_all_calls, used in src/warm.jl
 MacroTools = "0.5"
diff --git a/src/warm.jl b/src/warm.jl
index f4a9ab7..87ba062 100644
--- a/src/warm.jl
+++ b/src/warm.jl
@@ -2,6 +2,24 @@
 # time julia -e 'using TensorCast; TensorCast._macro(:(  Z[i,k][j] := fun(A[i,:], B[j])[k] + C[k]^2  ))'
 # 10.1 sec without this, 7.2 sec with (on 2nd run)
 
+#=
+# August 2022, Julia 1.9 master, without SnoopPrecompile
+
+me@ArmBook TensorCast % time julia -e '@time (using TensorCast; TensorCast._macro(:(  Z[i,k][j] := fun(A[i,:], B[j])[k] + C[k]^2  )))'
+  3.874656 seconds (6.59 M allocations: 474.681 MiB, 1.79% gc time, 76.88% compilation time: <1% of which was recompilation)
+julia -e   5.39s user 1.77s system 134% cpu 5.330 total
+
+# With SnoopPrecompile
+
+me@ArmBook TensorCast % time julia -e '@time (using TensorCast; TensorCast._macro(:(  Z[i,k][j] := fun(A[i,:], B[j])[k] + C[k]^2  )))'
+  2.948221 seconds (2.71 M allocations: 194.022 MiB, 68.51% compilation time: <1% of which was recompilation)
+julia -e   4.50s user 1.74s system 141% cpu 4.408 total
+
+=#
+
+using SnoopPrecompile
+@precompile_all_calls begin
+
 _macro(:(  Z[i,j] := A[i] + B[j]  ))
 _macro(:(  Z[i,_,k'] := A[i] + B[k'] / log(2) ))
 _macro(:(  Z[(i,j)] := A[j,-i] ))
@@ -11,3 +29,5 @@ _macro(:(  S[i,j] := sum(k)  ),:(  (B[j]+ C[k])[i]  ), call=CallInfo(:reduce))
 
 pretty(:(  (B[j]+ C[k])[i] ))
 pretty(@macroexpand @cast A[i,j] := B[j,i] + 1 )
+
+end