#include "torch/csrc/jit/ir/subgraph_matcher.h"
#include "torch/csrc/jit/passes/subgraph_rewrite.h"

#include "core/util/prelude.h"
#include "torch/csrc/jit/ir/irparser.h"

namespace torch_tensorrt {
namespace core {
namespace lowering {
namespace passes {

// https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
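// The rewriters below replace the fused op with its decomposition:
//   out = softmax((Q @ K^T) * (1 / sqrt(query.size(-1)))) @ V
// optionally adding a bias derived from attn_mask before the softmax; dropout is not applied by
// either replacement.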
void UnpackScaledDotProductAttention(std::shared_ptr<torch::jit::Graph>& graph) {
  std::string sdpa_pattern = R"IR(
    graph(%query, %key, %value, %attn_mask, %dropout_p, %is_causal):
      %out: Tensor = aten::scaled_dot_product_attention(%query, %key, %value, %attn_mask, %dropout_p, %is_causal)
      return (%out))IR";

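  // Replacement for calls with no attention mask: scale the Q.K^T scores by 1/sqrt(query.size(-1)),
  // softmax over the last dimension, then multiply by value.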
  std::string unpacked_sdpa_pattern = R"IR(
    graph(%query, %key, %value, %attn_mask, %dropout_p, %is_causal):
      %none : NoneType = prim::Constant()
      %1 : int = prim::Constant[value=-1]()
      %2 : int = prim::Constant[value=-2]()
      %3 : int = aten::size(%query, %1)
      %q_size : Long() = prim::NumToTensor(%3)
      %sqrt : Tensor = aten::sqrt(%q_size)
      %scale_factor : Tensor = aten::reciprocal(%sqrt)
      %key_transpose : Tensor = aten::transpose(%key, %2, %1)
      %matmul : Tensor = aten::matmul(%query, %key_transpose)
      %attn_weight : Tensor = aten::mul(%matmul, %scale_factor)
      %softmax : Tensor = aten::softmax(%attn_weight, %1, %none)
      %out : Tensor = aten::matmul(%softmax, %value)
      return (%out))IR";

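  // Replacement for calls with a float/bool attention mask: identical to the pattern above, except
  // a bias produced by trt::attn_bias_from_attn_mask is added to the scaled scores before the
  // softmax.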
  std::string unpacked_sdpa_attn_biased_pattern = R"IR(
    graph(%query, %key, %value, %attn_mask, %dropout_p, %is_causal):
      %none : NoneType = prim::Constant()
      %0 : int = prim::Constant[value=1]()
      %1 : int = prim::Constant[value=-1]()
      %2 : int = prim::Constant[value=-2]()
      %3 : int = aten::size(%query, %1)
      %q_size : Long() = prim::NumToTensor(%3)
      %sqrt : Tensor = aten::sqrt(%q_size)
      %scale_factor : Tensor = aten::reciprocal(%sqrt)
      %key_transpose : Tensor = aten::transpose(%key, %2, %1)
      %matmul : Tensor = aten::matmul(%query, %key_transpose)
      %attn_weight : Tensor = aten::mul(%matmul, %scale_factor)
      %attn_bias : Tensor = trt::attn_bias_from_attn_mask(%attn_mask)
      %attn_weight_with_bias : Tensor = aten::add(%attn_weight, %attn_bias, %0)
      %softmax : Tensor = aten::softmax(%attn_weight_with_bias, %1, %none)
      %out : Tensor = aten::matmul(%softmax, %value)
      return (%out))IR";

  // rewrite with None attn_mask
  torch::jit::SubgraphRewriter sdpa_rewriter;
  sdpa_rewriter.RegisterRewritePattern(sdpa_pattern, unpacked_sdpa_pattern);
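  // Filter: only rewrite calls where is_causal is a constant False and attn_mask is a constant None.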
  sdpa_rewriter.runOnGraph(
      graph, [](const torch::jit::Match& match, const std::unordered_map<std::string, torch::jit::Value*>&) {
        auto is_causal_node = match.anchor->inputs().at(5)->node();
        if (is_causal_node->kind() != at::prim::Constant) {
          LOG_WARNING("Could not unpack scaled_dot_product_attention with non constant is_causal: " << *is_causal_node);
          return false;
        }
        if (is_causal_node->i(at::attr::value) == 1) {
          LOG_WARNING("Could not unpack scaled_dot_product_attention with is_causal = True: " << *is_causal_node);
          return false;
        }
        auto attn_mask_node = match.anchor->inputs().at(3)->node();
        if (attn_mask_node->kind() != at::prim::Constant || !attn_mask_node->mustBeNone()) {
          return false;
        }
        return true;
      });

  // Rewrite with a float/bool attn_mask. This uses a custom op to implement the divergent behavior
  // between bool and float masks without a conditional.
  torch::jit::SubgraphRewriter sdpa_attn_mask_rewriter;
  sdpa_attn_mask_rewriter.RegisterRewritePattern(sdpa_pattern, unpacked_sdpa_attn_biased_pattern);
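  // Filter: same constant-False is_causal requirement as above; matches not handled by the first
  // rewriter (i.e. with a non-None attn_mask) are rewritten with the biased pattern.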
  sdpa_attn_mask_rewriter.runOnGraph(
      graph, [](const torch::jit::Match& match, const std::unordered_map<std::string, torch::jit::Value*>&) {
        auto is_causal_node = match.anchor->inputs().at(5)->node();
        if (is_causal_node->kind() != at::prim::Constant || is_causal_node->i(at::attr::value) == 1) {
          // messages already written in first pass, do not write again
          return false;
        }
        return true;
      });
  LOG_GRAPH("Post unpack scaled_dot_product_attention: " << *graph);
}
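
// Rough usage sketch (illustrative only; the IR string is an assumption, not part of this file).
// Given TorchScript IR containing aten::scaled_dot_product_attention, the pass is applied directly
// to the shared graph:
//
//   auto g = std::make_shared<torch::jit::Graph>();
//   torch::jit::parseIR(ir_with_sdpa, g.get());
//   UnpackScaledDotProductAttention(g);
//
// Matched calls are then replaced in place by the matmul/softmax decompositions above.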

} // namespace passes
} // namespace lowering
} // namespace core
} // namespace torch_tensorrt