diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 08f242dcb..4746c7149 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -154,7 +154,8 @@ jobs:
           export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
           export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
           # Limit MAX_JOBS otherwise the github runner goes OOM
-          MAX_JOBS=2 FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
+          # nvcc 11.8 can compile with 2 jobs, but nvcc 12.3 goes OOM
+          MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "123" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
           tmpname=cu${WHEEL_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
           wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
           ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
diff --git a/flash_attn/__init__.py b/flash_attn/__init__.py
index 50e062b4f..ac7517340 100644
--- a/flash_attn/__init__.py
+++ b/flash_attn/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.7.2"
+__version__ = "2.7.2.post1"
 
 from flash_attn.flash_attn_interface import (
     flash_attn_func,
diff --git a/setup.py b/setup.py
index d7d16b642..a567063fb 100644
--- a/setup.py
+++ b/setup.py
@@ -149,13 +149,6 @@ def validate_and_update_archs(archs):
     TORCH_MAJOR = int(torch.__version__.split(".")[0])
     TORCH_MINOR = int(torch.__version__.split(".")[1])
 
-    # Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h
-    # See https://github.com/pytorch/pytorch/pull/70650
-    generator_flag = []
-    torch_dir = torch.__path__[0]
-    if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")):
-        generator_flag = ["-DOLD_GENERATOR_PATH"]
-
     check_if_cuda_home_none("flash_attn")
     # Check, if CUDA11 is installed for compute capability 8.0
     cc_flag = []
@@ -271,7 +264,7 @@ def validate_and_update_archs(archs):
                 "csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu",
             ],
             extra_compile_args={
-                "cxx": ["-O3", "-std=c++17"] + generator_flag,
+                "cxx": ["-O3", "-std=c++17"],
                 "nvcc": append_nvcc_threads(
                     [
                         "-O3",
@@ -293,7 +286,6 @@ def validate_and_update_archs(archs):
                         # "-DFLASHATTENTION_DISABLE_UNEVEN_K",
                         # "-DFLASHATTENTION_DISABLE_LOCAL",
                     ]
-                    + generator_flag
                     + cc_flag
                 ),
             },
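
For context on the wheel-renaming step that sits next to the edited line in publish.yml: the `sed "s/-/+$tmpname-/2"` call rewrites only the second `-` in the wheel filename, which splices the CUDA/torch/ABI tag into the version as a local version label. Below is a minimal Python sketch of that same transformation; the filename and tag are hypothetical examples, since the real values are assembled in CI from `WHEEL_CUDA_VERSION`, `MATRIX_TORCH_VERSION`, and `matrix.cxx11_abi`.

```python
# Sketch of what `sed "s/-/+$tmpname-/2"` does to a built wheel's filename.
# Both values below are illustrative placeholders, not the exact CI values.
tmpname = "cu123torch2.5cxx11abiFALSE"
wheel = "flash_attn-2.7.2.post1-cp310-cp310-linux_x86_64.whl"

# Split on the first two dashes so only the second one (right after the
# version) is rewritten, giving a PEP 440 local version such as
# 2.7.2.post1+cu123torch2.5cxx11abiFALSE.
name, version, rest = wheel.split("-", 2)
renamed = f"{name}-{version}+{tmpname}-{rest}"
print(renamed)
# flash_attn-2.7.2.post1+cu123torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
```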