From c555642172e281cae6da8a6cff4dfd9ff678ae85 Mon Sep 17 00:00:00 2001
From: Tri Dao
Date: Tue, 12 Nov 2024 14:11:44 -0800
Subject: [PATCH] Bump to v2.7.0

---
 README.md              | 4 ++++
 flash_attn/__init__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 054af18c4..dcd558aeb 100644
--- a/README.md
+++ b/README.md
@@ -373,6 +373,10 @@ Thanks to @beginlner for this contribution.
 Support attention with softcapping, as used in Gemma-2 and Grok models.
 Thanks to @Narsil and @lucidrains for this contribution.
 
+### 2.7: Compatibility with torch compile
+
+Thanks to @ani300 for this contribution.
+
 ## Performance
 
 We present expected speedup (combined forward + backward pass) and memory savings from using FlashAttention against PyTorch standard attention, depending on sequence length, on different GPUs (speedup depends on memory bandwidth - we see more speedup on slower GPU memory).
diff --git a/flash_attn/__init__.py b/flash_attn/__init__.py
index e235d9274..218e299df 100644
--- a/flash_attn/__init__.py
+++ b/flash_attn/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.6.3"
+__version__ = "2.7.0"
 
 from flash_attn.flash_attn_interface import (
     flash_attn_func,
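
The changelog entry added above advertises compatibility with torch compile. The sketch below (not part of the patch) shows one way `flash_attn_func` might be exercised under `torch.compile`; the tensor shapes, dtype, and the `causal=True` flag are illustrative assumptions, not values taken from the patch.

```python
# Minimal sketch (assumptions noted above): calling flash_attn_func from a
# torch.compile'd function on CUDA fp16 tensors.
import torch
from flash_attn import flash_attn_func


@torch.compile
def attention(q, k, v):
    # flash_attn_func takes (batch, seqlen, nheads, headdim) fp16/bf16 CUDA tensors.
    return flash_attn_func(q, k, v, causal=True)


if __name__ == "__main__":
    q = torch.randn(2, 1024, 8, 64, device="cuda", dtype=torch.float16)
    k = torch.randn(2, 1024, 8, 64, device="cuda", dtype=torch.float16)
    v = torch.randn(2, 1024, 8, 64, device="cuda", dtype=torch.float16)
    out = attention(q, k, v)  # output has the same shape as q
    print(out.shape)
```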