From 54a47b4aebee66ea1ed999cbfb35372b3254c94a Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Fri, 8 Nov 2019 18:29:57 -0800 Subject: [PATCH 01/11] Fuse transpose into MatMul Implement Pow and constant scalar simplification --- cmake/CMakeLists.txt | 2 +- .../core/codegen/mti/math/matmul_ops.cc | 36 ++- .../core/codegen/mti/math/matmul_ops.h | 4 +- .../passes/op_ir_creator/math/unary_funcs.h | 51 ++++ .../passes/op_ir_creator/math/unary_ops.cc | 45 +--- .../nuphar/compiler/codegen_manager.cc | 8 + .../compiler/x86/op_ir_creator/all_ops.h | 26 +- .../x86/op_ir_creator/math/binary_ops.cc | 105 ++++++++ .../compiler/x86/op_ir_creator/math/gemm.cc | 2 +- .../compiler/x86/op_ir_creator/math/matmul.cc | 107 ++++++-- .../x86/op_ir_creator/math/unary_ops.cc | 50 +--- onnxruntime/core/providers/nuphar/kernel.cc | 2 + onnxruntime/core/providers/nuphar/kernel.h | 205 +++++++-------- .../nuphar/mti_x86/math/halide_ops.cc | 2 +- .../nuphar/mti_x86/math/halide_ops.h | 2 + .../nuphar/mti_x86/math/matmul_ops.cc | 234 ++++++++++++------ .../nuphar/mti_x86/math/matmul_ops.h | 12 +- .../core/providers/nuphar/mti_x86/math/pow.cc | 42 ++++ .../core/providers/nuphar/mti_x86/math/pow.h | 16 ++ .../nuphar/partition/subgraph_partitioner.cc | 7 +- .../nuphar/scripts/symbolic_shape_infer.py | 12 +- 21 files changed, 655 insertions(+), 315 deletions(-) create mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h create mode 100644 onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc create mode 100644 onnxruntime/core/providers/nuphar/mti_x86/math/pow.cc create mode 100644 onnxruntime/core/providers/nuphar/mti_x86/math/pow.h diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 52f0d60458ef9..4a8a4c3f7e189 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -188,7 +188,7 @@ if (MSVC) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4100") if (NOT onnxruntime_USE_CUDA) SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL") - SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") + #SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") endif() check_cxx_compiler_flag(-Qspectre HAS_QSPECTRE) if (HAS_QSPECTRE) diff --git a/onnxruntime/core/codegen/mti/math/matmul_ops.cc b/onnxruntime/core/codegen/mti/math/matmul_ops.cc index 46f2fb75b6e24..1188cd874f7fe 100644 --- a/onnxruntime/core/codegen/mti/math/matmul_ops.cc +++ b/onnxruntime/core/codegen/mti/math/matmul_ops.cc @@ -124,22 +124,36 @@ tvm::Tensor MatMul(const tvm::Tensor& A, const tvm::Tensor& B, const std::string tvm::Array ComputeMatMulShape( const tvm::Array& A_shape, - const tvm::Array& B_shape) { + const tvm::Array& B_shape, + bool trans_a, + bool trans_b) { auto a_rank = A_shape.size(); auto b_rank = B_shape.size(); tvm::Array output_shape; int64_t output_rank = std::max(a_rank, b_rank); - MTI_ASSERT(tvm::ir::Equal(A_shape[a_rank - 1], B_shape[b_rank - 2])); - for (int64_t i = 0; i < output_rank - 2; i++) { - tvm::Expr broadcasted_dim = tvm::make_const(HalideIR::Int(32), 1); - bool broadcasted = - BroadcastDim(A_shape, i, output_rank, broadcasted_dim) && - BroadcastDim(B_shape, i, output_rank, broadcasted_dim); - MTI_ASSERT(broadcasted); - output_shape.push_back(broadcasted_dim); + MTI_ASSERT(a_rank > 0 && b_rank > 0); + if (a_rank == 1 && b_rank == 1) { + MTI_ASSERT(!trans_a && !trans_b); + // reduction, output shape is empty + } else if (a_rank == 1) { + MTI_ASSERT(!trans_a && !trans_b); + output_shape = 
SliceShapeToDimension(B_shape, b_rank - 2);
+    output_shape.push_back(B_shape[b_rank - 1]);
+  } else if (b_rank == 1) {
+    MTI_ASSERT(!trans_a && !trans_b);
+    output_shape = SliceShapeToDimension(A_shape, a_rank - 1);
+  } else {
+    for (int64_t i = 0; i < output_rank - 2; i++) {
+      tvm::Expr broadcasted_dim = tvm::make_const(HalideIR::Int(32), 1);
+      bool broadcasted =
+          BroadcastDim(A_shape, i, output_rank, broadcasted_dim) &&
+          BroadcastDim(B_shape, i, output_rank, broadcasted_dim);
+      MTI_ASSERT(broadcasted);
+      output_shape.push_back(broadcasted_dim);
+    }
+    output_shape.push_back(A_shape[a_rank - (trans_a ? 1 : 2)]);
+    output_shape.push_back(B_shape[b_rank - (trans_b ? 2 : 1)]);
   }
-  output_shape.push_back(A_shape[a_rank - 2]);
-  output_shape.push_back(B_shape[b_rank - 1]);
   return output_shape;
 }
 
diff --git a/onnxruntime/core/codegen/mti/math/matmul_ops.h b/onnxruntime/core/codegen/mti/math/matmul_ops.h
index 7180b4f6d81e5..ab9986132d34a 100644
--- a/onnxruntime/core/codegen/mti/math/matmul_ops.h
+++ b/onnxruntime/core/codegen/mti/math/matmul_ops.h
@@ -11,7 +11,9 @@ namespace tvm_codegen {
 
 tvm::Array<tvm::Expr> ComputeMatMulShape(
     const tvm::Array<tvm::Expr>& A_shape,
-    const tvm::Array<tvm::Expr>& B_shape);
+    const tvm::Array<tvm::Expr>& B_shape,
+    bool trans_a = false,
+    bool trans_b = false);
 
 tvm::Tensor MatMul2D(const tvm::Tensor& A, const tvm::Tensor& B,
                      bool trans_a = false, bool trans_b = false, const std::string& name = "matmul2d");
diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h
new file mode 100644
index 0000000000000..29e6519af0ef1
--- /dev/null
+++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h
@@ -0,0 +1,51 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/framework/op_kernel_info.h"
+
+namespace onnxruntime {
+namespace tvm_codegen {
+// helper class for unary_ops with alpha
+class FuncWithAlpha {
+ public:
+  FuncWithAlpha(const Node& node) {
+    ProtoHelperNodeContext ctx(node);
+    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
+    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
+  }
+
+ protected:
+  float alpha_;
+};
+
+// helper class for unary_ops with alpha and beta
+class FuncWithAlphaBeta {
+ public:
+  FuncWithAlphaBeta(const Node& node) {
+    ProtoHelperNodeContext ctx(node);
+    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
+    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
+    ORT_ENFORCE(attrs.GetAttr("beta", &beta_).IsOK());
+  }
+
+ protected:
+  float alpha_;
+  float beta_;
+};
+
+// helper class for unary_ops with alpha and gamma
+class FuncWithAlphaGamma {
+ public:
+  FuncWithAlphaGamma(const Node& node) {
+    ProtoHelperNodeContext ctx(node);
+    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
+    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
+    ORT_ENFORCE(attrs.GetAttr("gamma", &gamma_).IsOK());
+  }
+
+ protected:
+  float alpha_;
+  float gamma_;
+};
+}  // namespace tvm_codegen
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc
index bd5b89c718435..0407c0a06abf6 100644
--- a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc
+++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc
@@ -5,54 +5,11 @@
 
 #include "core/codegen/common/op_macro.h"
 #include "core/codegen/mti/math/unary_ops.h"
-#include "core/framework/op_kernel_info.h"
+#include "core/codegen/passes/op_ir_creator/math/unary_funcs.h"
 
 namespace onnxruntime {
 namespace tvm_codegen {
 
-// helper class for unary_ops with alpha
-class FuncWithAlpha {
- public:
-  FuncWithAlpha(const Node& node) {
-    ProtoHelperNodeContext ctx(node);
-    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
-    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
-  }
-
- protected:
-  float alpha_;
-};
-
-// helper class for unary_ops with alpha and beta
-class FuncWithAlphaBeta {
- public:
-  FuncWithAlphaBeta(const Node& node) {
-    ProtoHelperNodeContext ctx(node);
-    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
-    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
-    ORT_ENFORCE(attrs.GetAttr("beta", &beta_).IsOK());
-  }
-
- protected:
-  float alpha_;
-  float beta_;
-};
-
-// helper class for unary_ops with alpha and gamma
-class FuncWithAlphaGamma {
- public:
-  FuncWithAlphaGamma(const Node& node) {
-    ProtoHelperNodeContext ctx(node);
-    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
-    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
-    ORT_ENFORCE(attrs.GetAttr("gamma", &gamma_).IsOK());
-  }
-
- protected:
-  float alpha_;
-  float gamma_;
-};
-
 // helper macro declares unary_ops helper class without attribute
 #define FuncClass(name) \
   class Func##name {    \
diff --git a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc
index 582ada8f3b944..3879bda9fe66e 100644
--- a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc
@@ -30,6 +30,8 @@ namespace nuphar {
 #define POOL_OP(OP) ADD_OP_ITEM(OP)
 #define REDUCE_V_OP(name) ADD_OP_ITEM(name)
 #define UNARY_OP(name) ADD_OP_ITEM(name)
+#define BINARY_OP(name) ADD_OP_ITEM(name)
+#define BINARY_CMP_OP(name) ADD_OP_ITEM(name)
 
 static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_registry) {
   LIST_ALL_X86_OPS()
@@ -39,6 +41,8 @@
static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_re #undef POOL_OP #undef REDUCE_V_OP #undef UNARY_OP +#undef BINARY_OP +#undef BINARY_CMP_OP // END: NupharTVM X86 IR creator classes @@ -138,6 +142,8 @@ static void RegisterAllNupharWeightLayouts(tvm_codegen::WeightLayoutRegistry* la #define POOL_OP(OP) ADD_OP_ITEM(OP) #define REDUCE_V_OP(name) ADD_OP_ITEM(name) #define UNARY_OP(name) ADD_OP_ITEM(name) +#define BINARY_OP(name) ADD_OP_ITEM(name) +#define BINARY_CMP_OP(name) ADD_OP_ITEM(name) static void RegisterNupharX86Dispatcher(const std::shared_ptr& builder, const tvm_codegen::OpIRRegistry* registry) { @@ -150,6 +156,8 @@ static void RegisterNupharX86Dispatcher(const std::shared_ptr info(&ctx); + NupharCodeGenCtx* ctx_nuphar = Promote(&ctx_codegen); + + ORT_ENFORCE(i < node.InputDefs().size()); + const auto* tensor = ctx_nuphar->GetOrtInitializerTensor(node.InputDefs()[i]->Name()); + + if (!tensor || tensor->Shape().Size() > 1) + return false; // return if not constant or not scalar + +#define ASSIGN_TVM_SCALAR(tvm_type, tensor_type) \ + if (utils::IsPrimitiveDataType(tensor->DataType())) { \ + scalar = tvm::make_const(tvm_type, *tensor->Data()); \ + } + +#define ASSIGN_TVM_SCALAR_ELSE(tvm_type, tensor_type) \ + else ASSIGN_TVM_SCALAR(tvm_type, tensor_type) + + ASSIGN_TVM_SCALAR(HalideIR::Float(32), float) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(64), int64_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(32), int32_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(64), uint64_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(32), uint32_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Float(64), double) + else { + return false; + } + +#undef ASSIGN_TVM_SCALAR + + return true; +} + +// helper local macro defines Evaluate of BINARY_OP OpIRCreators +#define BINARY_OP(name) \ + Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext& ctx_codegen, \ + tvm::Array& outputs) { \ + tvm::Expr scalar0, scalar1; \ + bool use_scalar0 = HandleConstantScalar(scalar0, 0, node, ctx_codegen); \ + bool use_scalar1 = HandleConstantScalar(scalar1, 1, node, ctx_codegen); \ + tvm::Tensor Y; \ + if (use_scalar0) \ + Y = name(scalar0, inputs[1], node.Name()); \ + else if (use_scalar1) \ + Y = name(inputs[0], scalar1, node.Name()); \ + else \ + Y = name(inputs[0], inputs[1], node.Name()); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +LIST_X86_BINARY_OPS() + +#undef BINARY_OP + +// helper local macro defines Evaluate of BINARY_CMP_OP OpIRCreators +#define BINARY_CMP_OP(name) \ + Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext& ctx_codegen, \ + tvm::Array& outputs) { \ + tvm::Expr scalar0, scalar1; \ + bool use_scalar0 = HandleConstantScalar(scalar0, 0, node, ctx_codegen); \ + bool use_scalar1 = HandleConstantScalar(scalar1, 1, node, ctx_codegen); \ + tvm::Tensor Y; \ + if (use_scalar0) \ + Y = name(scalar0, inputs[1], node.Name()); \ + else if (use_scalar1) \ + Y = name(inputs[0], scalar1, node.Name()); \ + else \ + Y = name(inputs[0], inputs[1], node.Name()); \ + Y = Cast(Y, HalideIR::UInt(8), "cast_bool_" #name); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +LIST_X86_BINARY_CMP_OPS() + +#undef BINARY_CMP_OP + +} // namespace nuphar +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/gemm.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/gemm.cc index 
5ac2adf738017..a248af400edf9 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/gemm.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/gemm.cc @@ -33,7 +33,7 @@ Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(Gemm)::Evaluate( // use native sgemm for floating point if (A->dtype == HalideIR::Float(32) && B->dtype == HalideIR::Float(32) && - MatMulExternCpu(A, B, Y, !!trans_a, !!trans_b, node.Name() + "_gemm")) { + GemmExternCpu(A, B, Y, !!trans_a, !!trans_b, node.Name() + "_gemm")) { if (beta != 0) { tvm::Tensor beta_bias = (beta == 1) ? C : tvm_codegen::Mul(tvm::make_const(tvm::Float(32), beta), C); Y = tvm_codegen::Add((alpha == 1) ? Y : tvm_codegen::Mul(tvm::make_const(tvm::Float(32), alpha), Y), beta_bias, node.Name() + "_add_bias"); diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc index e81ef497c50a8..f766fe98ad7c1 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc @@ -1,14 +1,14 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" - -#include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h" -#include "core/providers/nuphar/mti_x86/math/matmul_ops.h" #include "core/codegen/mti/mti_tvm_utils.h" #include "core/codegen/passes/weight_layout/transpose_2d.h" #include "core/codegen/passes/weight_layout/vertical_stripes_2d.h" +#include "core/framework/op_kernel_info.h" +#include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h" +#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" #include "core/providers/nuphar/compiler/x86/x86_target_info.h" +#include "core/providers/nuphar/mti_x86/math/matmul_ops.h" #include @@ -89,29 +89,86 @@ static bool MatMul_weights2D( return true; } -static bool MatMulF32ExternCpuEx( - ONNX_NAMESPACE::TensorProto_DataType proto_type, - NupharCodeGenCtx& ctx_nuphar, - const tvm::Tensor& A, - const tvm::Tensor& B, +static bool MatMulF32ExternCPU( + tvm::Tensor A, + tvm::Tensor B, tvm::Tensor& Y, - const std::string& B_initializer_name = "", - bool trans_a = false, - bool trans_b = false, - const std::string& name = "matmul_extern_cpu_ex") { - // transpose weights if not already - tvm::Tensor actual_B = B; - - if (ctx_nuphar.IsInitializer(B_initializer_name) && !trans_b) { - auto layout_key = tvm_codegen::WeightLayoutTranspose2D::GetKey(proto_type); - actual_B = ctx_nuphar.ApplyWeightLayout(layout_key, B_initializer_name, B, true); - trans_b = true; + const Node& node, + tvm_codegen::CodeGenContext& ctx_codegen) { + NupharCodeGenCtx* ctx_nuphar = Promote(&ctx_codegen); + + // try to fuse tranpose in MatMul input with MatMul + auto find_transposed_input = [&ctx_nuphar](const tvm::Tensor& t, std::vector& cumulated_permute) { + tvm::Tensor out = t; + int64_t rank = gsl::narrow(t->shape.size()); + std::vector default_node_perm(rank); + cumulated_permute.resize(rank); + for (int64_t i = 0; i < rank; ++i) { + cumulated_permute[i] = gsl::narrow(i); + default_node_perm[i] = rank - i - 1; + } + for (const Node* root_node = ctx_nuphar->FindNode(out); + root_node != nullptr && root_node->OpType() == "Transpose"; + root_node = ctx_nuphar->FindNode(out)) { + ProtoHelperNodeContext ctx(*root_node); + OpNodeProtoHelper info(&ctx); + auto perm = 
info.GetAttrsOrDefault("perm", default_node_perm); + std::vector updated_cumulated_permute = cumulated_permute; + for (int64_t dst_dim = 0; dst_dim < rank; ++dst_dim) { + auto src_dim = tvm_codegen::HandleNegativeAxis(perm[cumulated_permute[dst_dim]], rank); + updated_cumulated_permute[dst_dim] = gsl::narrow(src_dim); + } + cumulated_permute = updated_cumulated_permute; + // op corresponding to node should be Transpose + auto op = out->op.as(); + ORT_ENFORCE(op != nullptr); + ORT_ENFORCE(op->InputTensors().size() == 1); + out = op->InputTensors()[0]; + } + return out; + }; + + std::vector permute_A; + std::vector permute_B; + const std::vector* p_permute_A = nullptr; + const std::vector* p_permute_B = nullptr; + tvm::Tensor root_A = find_transposed_input(A, permute_A); + tvm::Tensor root_B = find_transposed_input(B, permute_B); + if (A->shape.size() == B->shape.size() && A->shape.size() > 2) { + // currently only fuse Transpose into MatMul when rank(A) == rank(B) + // make sure no broadcasting in MatMul + bool no_broadcast = true; + for (size_t i = 0; i < A->shape.size() - 2; ++i) { + if (!tvm::ir::Equal(A->shape[i], B->shape[i])) { + no_broadcast = false; + break; + } + } + if (no_broadcast) { + if (CanPermuteBeFusedInMatMul(permute_A)) { + A = root_A; + p_permute_A = &permute_A; + } + if (CanPermuteBeFusedInMatMul(permute_B)) { + B = root_B; + p_permute_B = &permute_B; + } + } } - return nuphar::MatMulExternCpu(A, actual_B, Y, trans_a, trans_b, name); + const auto& B_name = node.InputDefs()[1]->Name(); + if (ctx_nuphar->IsInitializer(B_name) && B->shape.size() == 2) { + // matmul with initializer, using transpose weights + auto layout_key = tvm_codegen::WeightLayoutTranspose2D::GetKey(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + auto actual_B = ctx_nuphar->ApplyWeightLayout(layout_key, B_name, B, true); + return nuphar::GemmExternCpu(A, actual_B, Y, false, true, B_name); + } else { + return nuphar::MatMulExternCpu(A, B, Y, p_permute_A, p_permute_B, node.Name() + "_matmul_extern"); + } } -Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(MatMul)::Evaluate( +Status +NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(MatMul)::Evaluate( const tvm::Array& inputs, const Node& node, tvm_codegen::CodeGenContext& ctx_codegen, @@ -123,15 +180,17 @@ Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(MatMul)::Evaluate( tvm::Tensor Y; auto& A = inputs[0]; auto& B = inputs[1]; - const std::string& input_1_name = node.InputDefs()[1]->Name(); + // float MatMul, try use extern if (A->dtype == HalideIR::Float(32) && B->dtype == HalideIR::Float(32) && - MatMulF32ExternCpuEx(proto_type, *ctx_nuphar, A, B, Y, input_1_name)) { + MatMulF32ExternCPU(A, B, Y, node, ctx_codegen)) { outputs.push_back(Y); return Status::OK(); } + // if B is 2D initializer, use vertical stripe layout + const std::string& input_1_name = node.InputDefs()[1]->Name(); if (ShapeRank(node.InputDefs()[1]) == 2 && ctx_nuphar->IsInitializer(input_1_name)) { if (MatMul_weights2D(proto_type, A, B, input_1_name, *ctx_nuphar, Y)) { outputs.push_back(Y); diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/unary_ops.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/unary_ops.cc index ec9a22af84576..30fec1dc24c63 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/unary_ops.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/unary_ops.cc @@ -3,55 +3,13 @@ #include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" +#include 
"core/codegen/passes/op_ir_creator/math/unary_funcs.h" #include "core/framework/op_kernel_info.h" #include "core/providers/nuphar/mti_x86/math/unary_ops.h" namespace onnxruntime { namespace nuphar { -// helper class for unary_ops with alpha -class FuncWithAlpha { - public: - FuncWithAlpha(const Node& node) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); - } - - protected: - float alpha_; -}; - -// helper class for unary_ops with alpha and beta -class FuncWithAlphaBeta { - public: - FuncWithAlphaBeta(const Node& node) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); - ORT_ENFORCE(attrs.GetAttr("beta", &beta_).IsOK()); - } - - protected: - float alpha_; - float beta_; -}; - -// helper class for unary_ops with alpha and gamma -class FuncWithAlphaGamma { - public: - FuncWithAlphaGamma(const Node& node) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); - ORT_ENFORCE(attrs.GetAttr("gamma", &gamma_).IsOK()); - } - - protected: - float alpha_; - float gamma_; -}; - // helper macro declares unary_ops helper class without attribute #define FuncClass(name) \ class Func##name { \ @@ -64,7 +22,7 @@ class FuncWithAlphaGamma { // helper macro declares unary_ops helper class with alpha #define FuncClassAlpha(name) \ - class Func##name : public FuncWithAlpha { \ + class Func##name : public tvm_codegen::FuncWithAlpha { \ public: \ Func##name(const Node& node) : FuncWithAlpha(node) {} \ tvm::Tensor operator()(const tvm::Tensor& X) const { \ @@ -74,7 +32,7 @@ class FuncWithAlphaGamma { // helper macro declares unary_ops helper class with alpha and beta #define FuncClassAlphaBeta(name) \ - class Func##name : public FuncWithAlphaBeta { \ + class Func##name : public tvm_codegen::FuncWithAlphaBeta { \ public: \ Func##name(const Node& node) : FuncWithAlphaBeta(node) {} \ tvm::Tensor operator()(const tvm::Tensor& X) const { \ @@ -84,7 +42,7 @@ class FuncWithAlphaGamma { // helper macro declares unary_ops helper class with alpha and gamma #define FuncClassAlphaGamma(name) \ - class Func##name : public FuncWithAlphaGamma { \ + class Func##name : public tvm_codegen::FuncWithAlphaGamma { \ public: \ Func##name(const Node& node) : FuncWithAlphaGamma(node) {} \ tvm::Tensor operator()(const tvm::Tensor& X) const { \ diff --git a/onnxruntime/core/providers/nuphar/kernel.cc b/onnxruntime/core/providers/nuphar/kernel.cc index dbecb12d1a458..459894105e711 100644 --- a/onnxruntime/core/providers/nuphar/kernel.cc +++ b/onnxruntime/core/providers/nuphar/kernel.cc @@ -4,6 +4,7 @@ #include "core/providers/nuphar/kernel.h" #include "core/codegen/passes/utils/codegen_context.h" +#include "core/codegen/common/profile.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h" #include "core/providers/nuphar/compiler/initializer_info.h" @@ -117,6 +118,7 @@ Status NupharKernelState::Compute(OpKernelContext* op_kernel_context) const { compute_ctx->Bind(op_kernel_context); for (auto* call : exec_block_calls_) { + CODEGEN_PROFILER_EVENT(call->Name()); call->Run(compute_ctx); } diff --git a/onnxruntime/core/providers/nuphar/kernel.h b/onnxruntime/core/providers/nuphar/kernel.h index d308e9dae736e..c4f309ad77953 100644 --- a/onnxruntime/core/providers/nuphar/kernel.h +++ b/onnxruntime/core/providers/nuphar/kernel.h @@ -72,108 +72,109 @@ class 
NupharKernelState { #define DISABLE_MACRO(X) -#define LIST_NUPHAR_OPS() \ - NUPHAR_OP(Abs, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Add, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ArgMax, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ArgMax, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ArgMax, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ArgMin, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ArgMin, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(AveragePool, 7, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(AveragePool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(AveragePool, 11, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(Ceil, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Clip, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Concat, 4, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Concat, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - DISABLE_MACRO(NUPHAR_OP(Conv, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())) \ - NUPHAR_OP(Crop, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Div, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Dropout, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Elu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Equal, 7, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Equal, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Erf, 9, DataTypeImpl::GetTensorType()) \ - NUPHAR_OP(Exp, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Expand, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(Flatten, 1, 8, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Flatten, 9, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Flatten, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Floor, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Gemm, 7, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(Gemm, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(Greater, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(HardSigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Identity, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(LeakyRelu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Less, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Log, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(LogSoftmax, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(LogSoftmax, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - DISABLE_MACRO(NUPHAR_OP(LSTM, 7, DataTypeImpl::AllIEEEFloatTensorTypes())) \ - NUPHAR_VERSIONED_OP(MatMul, 1, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(MatMul, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(Max, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_VERSIONED_OP(MaxPool, 8, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(MaxPool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(MaxPool, 11, 
DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(Min, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Mul, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Neg, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Pad, 2, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(ParametricSoftplus, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(PRelu, 7, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Relu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Reciprocal, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceL1, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceL1, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceL2, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(ReduceL2, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceLogSum, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(ReduceLogSum, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceLogSumExp, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(ReduceLogSumExp, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceMax, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceMax, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceMean, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceMean, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceMin, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceMin, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceProd, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceProd, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceSum, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceSum, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceSumSquare, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceSumSquare, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Reshape, 5, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ScaledTanh, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Selu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Shape, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Sigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Slice, 1, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Slice, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Slice, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(Softmax, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Softmax, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Softplus, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Softsign, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Split, 2, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Split, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Squeeze, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Squeeze, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Sqrt, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Sub, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Sum, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Tanh, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ThresholdedRelu, 1, 
DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Tile, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Transpose, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(Unsqueeze, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Unsqueeze, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ +#define LIST_NUPHAR_OPS() \ + NUPHAR_OP(Abs, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Add, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ArgMax, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ArgMax, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ArgMax, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ArgMin, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ArgMin, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(AveragePool, 7, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(AveragePool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(AveragePool, 11, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(Ceil, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Clip, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Concat, 4, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Concat, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + DISABLE_MACRO(NUPHAR_OP(Conv, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())) \ + NUPHAR_OP(Crop, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Div, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Dropout, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Elu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Equal, 7, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Equal, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Erf, 9, DataTypeImpl::GetTensorType()) \ + NUPHAR_OP(Exp, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Expand, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(Flatten, 1, 8, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Flatten, 9, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Flatten, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Floor, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Gemm, 7, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(Gemm, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(Greater, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(HardSigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Identity, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(LeakyRelu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Less, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Log, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(LogSoftmax, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(LogSoftmax, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + DISABLE_MACRO(NUPHAR_OP(LSTM, 7, DataTypeImpl::AllIEEEFloatTensorTypes())) \ + NUPHAR_VERSIONED_OP(MatMul, 1, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(MatMul, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(Max, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ + 
NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_VERSIONED_OP(MaxPool, 8, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(MaxPool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(MaxPool, 11, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(Min, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Mul, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Neg, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Pad, 2, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(ParametricSoftplus, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Pow, 7, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(PRelu, 7, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Relu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Reciprocal, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceL1, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceL1, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceL2, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(ReduceL2, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceLogSum, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(ReduceLogSum, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceLogSumExp, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(ReduceLogSumExp, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceMax, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceMax, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceMean, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceMean, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceMin, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceMin, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceProd, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceProd, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceSum, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceSum, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceSumSquare, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceSumSquare, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Reshape, 5, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ScaledTanh, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Selu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Shape, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Sigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Slice, 1, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Slice, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Slice, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(Softmax, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Softmax, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Softplus, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Softsign, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Split, 2, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Split, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Squeeze, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + 
NUPHAR_OP(Squeeze, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Sqrt, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Sub, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Sum, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Tanh, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ThresholdedRelu, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Tile, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Transpose, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(Unsqueeze, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Unsqueeze, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ NUPHAR_OP(Where, 9, DataTypeImpl::AllFixedSizeTensorTypes()) } // namespace nuphar diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.cc index d18c6292495de..eb9a78ffea520 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.cc @@ -174,7 +174,7 @@ tvm::Expr raise_to_integer_power(const tvm::Expr& e, int64_t p) { * cast to Float(32). For Float(32), cleanly vectorizable, and * accurate up to the last few bits of the mantissa. Gets worse when * approaching overflow. Vectorizes cleanly. */ -inline tvm::Expr halideir_pow(tvm::Expr x, tvm::Expr y) { +tvm::Expr halideir_pow(tvm::Expr x, tvm::Expr y) { if (const int64_t* i = as_const_int(y)) { return raise_to_integer_power(x, *i); } diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.h b/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.h index e421bd5071715..80ed407175d4f 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.h +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.h @@ -43,6 +43,8 @@ tvm::Expr halideir_exp(const tvm::Expr& x_full); tvm::Expr halideir_log(const tvm::Expr& x_full); +tvm::Expr halideir_pow(tvm::Expr x, tvm::Expr y); + tvm::Expr fast_log(const tvm::Expr& x); } // namespace nuphar diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc index 1c30b29687843..141f70c572505 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc @@ -19,7 +19,7 @@ namespace nuphar { tvm::Tensor MatMul2D(const tvm::Tensor& A, const tvm::Tensor& B, bool trans_a, bool trans_b, const std::string& name) { tvm::Tensor Y; - if (MatMulExternCpu(A, B, Y, trans_a, trans_b)) + if (GemmExternCpu(A, B, Y, trans_a, trans_b)) return Y; return topi::matmul(A, B, trans_a, trans_b, name); @@ -110,31 +110,79 @@ TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.batched_matmul_cpu") DCHECK(tvm::runtime::TypeMatch(B->dtype, kDLFloat, 32)); DCHECK(tvm::runtime::TypeMatch(C->dtype, kDLFloat, 32)); - MatMulComputeHelper helper; - TensorShape A_shape(A->shape, A->ndim); - TensorShape B_shape(B->shape, B->ndim); - helper.Compute(A_shape, B_shape); - - size_t max_len = helper.OutputOffsets().size(); - for (size_t i = 0; i < max_len; i++) { - math::MatMul( - static_cast(helper.M()), - static_cast(helper.N()), - static_cast(helper.K()), - (float*)A->data + helper.LeftOffsets()[i], - (float*)B->data + helper.RightOffsets()[i], - (float*)C->data + helper.OutputOffsets()[i], - nullptr); // TODO: use thread pool from OpContext + if (args.num_args == 3) { + MatMulComputeHelper helper; + TensorShape A_shape(A->shape, A->ndim); + 
TensorShape B_shape(B->shape, B->ndim); + helper.Compute(A_shape, B_shape); + + size_t max_len = helper.OutputOffsets().size(); + for (size_t i = 0; i < max_len; i++) { + math::MatMul( + static_cast(helper.M()), + static_cast(helper.N()), + static_cast(helper.K()), + (float*)A->data + helper.LeftOffsets()[i], + (float*)B->data + helper.RightOffsets()[i], + (float*)C->data + helper.OutputOffsets()[i], + nullptr); // TODO: use thread pool from OpContext + } + } else { + // matmul fused with transpose, modify lda/ldb and step_a/step_b for the zero-cost transpose + DCHECK(A->ndim == B->ndim); + DCHECK(args.num_args - 3 == A->ndim + B->ndim); + std::vector permute_A(A->ndim); + std::vector stride_A(A->ndim); + std::vector permute_B(B->ndim); + std::vector stride_B(B->ndim); + int arg_idx = 3; + int num_matmuls = 1; + for (int i = 0; i < A->ndim; ++i) { + permute_A[i] = tvm::runtime::TVMArgValue(args.values[arg_idx + i], args.type_codes[arg_idx + i]); + if (i < A->ndim - 2) { + num_matmuls *= A->shape[permute_A[i]]; + } + stride_A[A->ndim - 1 - i] = (i == 0) ? 1 : stride_A[A->ndim - i] * A->shape[A->ndim - i]; + } + arg_idx += A->ndim; + for (int i = 0; i < B->ndim; ++i) { + permute_B[i] = tvm::runtime::TVMArgValue(args.values[arg_idx + i], args.type_codes[arg_idx + i]); + stride_B[B->ndim - 1 - i] = (i == 0) ? 1 : stride_B[B->ndim - i] * B->shape[B->ndim - i]; + } + + float alpha = 1.0f; + float beta = 0.0f; + int64_t M = A->shape[permute_A[A->ndim - 2]]; + int64_t K = A->shape[permute_A[A->ndim - 1]]; + int64_t N = B->shape[permute_B[B->ndim - 1]]; + bool trans_a = (permute_A[A->ndim - 2] == A->ndim - 1); + bool trans_b = (permute_B[B->ndim - 2] == B->ndim - 1); + int64_t step_a = stride_A[permute_A[A->ndim - 3]]; + int64_t lda = stride_A[permute_A[A->ndim - (trans_a ? 1 : 2)]]; + int64_t step_b = stride_B[permute_B[B->ndim - 3]]; + int64_t ldb = stride_B[permute_B[B->ndim - (trans_b ? 1 : 2)]]; + + for (int i = 0; i < num_matmuls; i++) { + math::GemmEx( + trans_a ? CblasTrans : CblasNoTrans, + trans_b ? 
CblasTrans : CblasNoTrans, + M, + N, + K, + alpha, + (float*)A->data + i * step_a, + lda, + (float*)B->data + i * step_b, + ldb, + beta, + (float*)C->data + i * M * N, + N, + nullptr); // TODO: use thread pool from OpContext + } } }); -bool MatMulExternCpu( - const tvm::Tensor& A, - const tvm::Tensor& B, - tvm::Tensor& Y, - bool trans_a, - bool trans_b, - const std::string& name) { +static bool ShouldUseMatMulExtern() { // Note: currently default behavior is always prefer extern const codegen::CodeGenSettings& settings = codegen::CodeGenSettings::Instance(); if (settings.HasOption(kNupharMatmulExec)) { @@ -144,6 +192,30 @@ bool MatMulExternCpu( if (!prefer_extern) return false; } + return true; +} + +bool CanPermuteBeFusedInMatMul(const std::vector& perm) { + auto rank = gsl::narrow(perm.size()); + if (rank < 2) return true; + + // only fusable if inner-most dim could be transposed + return (perm[rank - 1] == rank - 1) || + (perm[rank - 2] == rank - 1); +}; + +bool GemmExternCpu( + const tvm::Tensor& A, + const tvm::Tensor& B, + tvm::Tensor& Y, + bool trans_a, + bool trans_b, + const std::string& name) { + if (!ShouldUseMatMulExtern()) + return false; + + if (A->shape.size() == 1 && B->shape.size() == 1) + return false; // TVM extern cannot have output shape being empty // TODO: add support for mixed precisions if (A->dtype != B->dtype || @@ -151,69 +223,85 @@ bool MatMulExternCpu( A->dtype.bits() != 32) return false; - // inputs need to be at least 1D - auto rank_A = A->shape.size(); - auto rank_B = B->shape.size(); - if (rank_A < 1 || rank_B < 1) + tvm::Array out_shape = tvm_codegen::ComputeMatMulShape(A->shape, B->shape, trans_a, trans_b); + + Y = topi::detail::make_extern( + {out_shape}, {A->dtype}, {A, B}, + [&](tvm::Array ins, tvm::Array outs) { + return topi::detail::call_packed( + {tvm::Expr("tvm.contrib.onnxruntime.sgemm_cpu"), + topi::detail::pack_buffer(ins[0]), + topi::detail::pack_buffer(ins[1]), + topi::detail::pack_buffer(outs[0]), + trans_a, + trans_b}); + }, + name, "", {})[0]; + + return true; +} + +bool MatMulExternCpu( + const tvm::Tensor& A, + const tvm::Tensor& B, + tvm::Tensor& Y, + const std::vector* permute_A, + const std::vector* permute_B, + const std::string& name) { + if (permute_A != nullptr) { + ORT_ENFORCE(permute_B != nullptr); + ORT_ENFORCE(CanPermuteBeFusedInMatMul(*permute_A)); + ORT_ENFORCE(CanPermuteBeFusedInMatMul(*permute_B)); + ORT_ENFORCE(permute_A->size() == permute_B->size()); + ORT_ENFORCE(permute_A->size() == A->shape.size()); + ORT_ENFORCE(permute_B->size() == B->shape.size()); + } + + // TODO: add support for mixed precisions + if (A->dtype != B->dtype || + !A->dtype.is_float() || + A->dtype.bits() != 32) return false; - // only allow trans_a for 2D inputs - if (rank_A != 2 && trans_a) + // inputs need to be at least 1D + auto rank_A = gsl::narrow(A->shape.size()); + auto rank_B = gsl::narrow(B->shape.size()); + + if (rank_A < 1 || rank_B < 1) return false; // do not support 1-D x 1-D as tvm extern require buffer size > 0 if (rank_A == 1 && rank_B == 1) return false; - tvm::Array out_shape; - if (rank_A == 1) { - // 1-D x N-D - if (trans_b) { - ORT_ENFORCE(rank_B == 2); - out_shape.push_back(B->shape[0]); - } else { - for (size_t d = 0; d < rank_B - 2; ++d) - out_shape.push_back(B->shape[d]); - out_shape.push_back(B->shape[rank_B - 1]); - } - } else if (rank_B == 1) { - // N-D x 1-D - for (size_t d = 0; d < rank_A - 1; ++d) - out_shape.push_back(A->shape[d]); - } else { - // N-D x N-D - if (rank_B == 2) { - if (trans_a) { - // trans_a is only 
allowed for 2D - out_shape.push_back(A->shape[rank_A - 1]); - } else { - for (size_t d = 0; d < rank_A - 1; ++d) - out_shape.push_back(A->shape[d]); - } - out_shape.push_back(B->shape[trans_b ? rank_B - 2 : rank_B - 1]); - } else { - ORT_ENFORCE(!trans_a && !trans_b); - // batched matmul - out_shape = tvm_codegen::ComputeMatMulShape(A->shape, B->shape); - } + tvm::Array matmul_A_shape, matmul_B_shape; + for (int32_t d = 0; d < rank_A; ++d) { + matmul_A_shape.push_back(A->shape[permute_A != nullptr ? permute_A->at(d) : d]); + } + for (int32_t d = 0; d < rank_B; ++d) { + matmul_B_shape.push_back(B->shape[permute_B != nullptr ? permute_B->at(d) : d]); } + tvm::Array out_shape; + out_shape = tvm_codegen::ComputeMatMulShape(matmul_A_shape, matmul_B_shape); + Y = topi::detail::make_extern( {out_shape}, {A->dtype}, {A, B}, [&](tvm::Array ins, tvm::Array outs) { - if (rank_B <= 2) { - return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.sgemm_cpu"), - topi::detail::pack_buffer(ins[0]), - topi::detail::pack_buffer(ins[1]), - topi::detail::pack_buffer(outs[0]), - trans_a, - trans_b}); - } else { - return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.batched_matmul_cpu"), - topi::detail::pack_buffer(ins[0]), - topi::detail::pack_buffer(ins[1]), - topi::detail::pack_buffer(outs[0])}); + tvm::Array extern_args = { + tvm::Expr("tvm.contrib.onnxruntime.batched_matmul_cpu"), + topi::detail::pack_buffer(ins[0]), + topi::detail::pack_buffer(ins[1]), + topi::detail::pack_buffer(outs[0])}; + if (permute_A != nullptr && permute_B != nullptr) { + for (const auto& perm_A : *permute_A) { + extern_args.push_back(perm_A); + } + for (const auto& perm_B : *permute_B) { + extern_args.push_back(perm_B); + } } + return topi::detail::call_packed(extern_args); }, name, "", {})[0]; @@ -222,7 +310,7 @@ bool MatMulExternCpu( tvm::Tensor MatMul(const tvm::Tensor& A, const tvm::Tensor& B, const std::string& name) { tvm::Tensor Y; - if (MatMulExternCpu(A, B, Y)) + if (GemmExternCpu(A, B, Y)) return Y; // go through generic case otherwise return tvm_codegen::MatMul(A, B, name); diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.h b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.h index 53484dc7f5605..97c76cbb140fc 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.h +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.h @@ -10,7 +10,7 @@ namespace nuphar { tvm::Tensor MatMul2D(const tvm::Tensor& A, const tvm::Tensor& B, bool trans_a = false, bool trans_b = false, const std::string& name = "matmul2d"); -bool MatMulExternCpu( +bool GemmExternCpu( const tvm::Tensor& A, const tvm::Tensor& B, tvm::Tensor& Y, @@ -18,6 +18,16 @@ bool MatMulExternCpu( bool trans_b = false, const std::string& name = "matmul_extern_cpu"); +bool MatMulExternCpu( + const tvm::Tensor& A, + const tvm::Tensor& B, + tvm::Tensor& Y, + const std::vector* permute_A, + const std::vector* permute_B, + const std::string& name = "matmul_permute_extern_cpu"); + +bool CanPermuteBeFusedInMatMul(const std::vector& perm); + tvm::Tensor MatMul(const tvm::Tensor& A, const tvm::Tensor& B, const std::string& name); } // namespace nuphar diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/pow.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/pow.cc new file mode 100644 index 0000000000000..006d5f0a98035 --- /dev/null +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/pow.cc @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License.
+
+#include "core/providers/nuphar/mti_x86/math/halide_ops.h"
+#include "topi/broadcast.h"
+#include "tvm/ir.h"
+
+namespace onnxruntime {
+namespace nuphar {
+
+tvm::Tensor Pow(tvm::Tensor A, tvm::Tensor B, const std::string& name = "pow") {
+  return topi::power(A, B);
+}
+
+tvm::Tensor Pow(tvm::Tensor A, tvm::Expr B, const std::string& name = "pow") {
+  // special case for integer pow passed in
+  const tvm::ir::FloatImm* op = B.as<tvm::ir::FloatImm>();
+  if (op != nullptr) {
+    int64_t i = (int64_t)(op->value);
+    if ((double)i == op->value) {
+      B = tvm::make_const(HalideIR::Int(64), i);  // replace B with integer for halideir_pow
+    }
+  }
+  return tvm::compute(
+      A->shape,
+      [&](const tvm::Array<tvm::Var>& indices) {
+        return halideir_pow(A(indices), B);
+      },
+      name);
+}
+
+tvm::Tensor Pow(tvm::Expr A, tvm::Tensor B, const std::string& name = "pow") {
+  return tvm::compute(
+      B->shape,
+      [&](const tvm::Array<tvm::Var>& indices) {
+        return halideir_pow(A, B(indices));
+      },
+      name);
+}
+
+}  // namespace nuphar
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/pow.h b/onnxruntime/core/providers/nuphar/mti_x86/math/pow.h
new file mode 100644
index 0000000000000..339a75cad8464
--- /dev/null
+++ b/onnxruntime/core/providers/nuphar/mti_x86/math/pow.h
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include <string>
+#include <tvm/tvm.h>
+
+namespace onnxruntime {
+namespace nuphar {
+
+tvm::Tensor Pow(tvm::Tensor A, tvm::Tensor B, const std::string& name = "pow");
+tvm::Tensor Pow(tvm::Expr A, tvm::Tensor B, const std::string& name = "pow");
+tvm::Tensor Pow(tvm::Tensor A, tvm::Expr B, const std::string& name = "pow");
+
+}  // namespace nuphar
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc b/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc
index 2632434cfdeb6..7ffe9322bc2d5 100644
--- a/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc
+++ b/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc
@@ -189,7 +189,12 @@ Status SubgraphPartitioner::Partition(
       bool unused_initializer = false;
       if (t != nullptr) {
         // note for Reshape and Tile, shape/repeats as initializer is not used at runtime
-        unused_initializer = ((node.OpType() == "Reshape" || node.OpType() == "Tile") && i == 1);
+        // scalar initializers in binary ops are not used at runtime either
+        static const std::unordered_set<std::string> binary_ops =
+            {"Add", "Div", "Sub", "Mul", "Pow", "Equal", "Greater", "Less"};
+
+        unused_initializer = ((node.OpType() == "Reshape" || node.OpType() == "Tile") && i == 1) ||
+                             (binary_ops.count(node.OpType()) > 0 && t->Shape().Size() == 1);
 
         if (!unused_initializer) {
           subgraph.initializers.emplace(def.Name(), t);
diff --git a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py
index 0f900b595bec5..b59bc9a9411df 100644
--- a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py
+++ b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py
@@ -109,7 +109,7 @@ def __init__(self, int_max, auto_merge, verbose):
         self.verbose_ = verbose
         self.int_max_ = int_max
 
-    def _add_suggested_merge(self, symbols):
+    def _add_suggested_merge(self, symbols, apply=False):
         assert all([(type(s) == str and s in self.symbolic_dims_) or is_literal(s) for s in symbols])
         symbols = set(symbols)
         for k,v in self.suggested_merge_.items():
@@
-142,11 +142,13 @@ def _add_suggested_merge(self, symbols): for k,v in self.suggested_merge_.items(): if v == s: self.suggested_merge_[k] = map_to + if apply and self.auto_merge_: + self._apply_suggested_merge() - def _apply_suggested_merge_to_graph_input(self): + def _apply_suggested_merge(self, graph_input_only=False): if not self.suggested_merge_: return - for i in self.out_mp_.graph.input: + for i in list(self.out_mp_.graph.input) + ([] if graph_input_only else list(self.out_mp_.graph.value_info)): for d in i.type.tensor_type.shape.dim: if d.dim_param in self.suggested_merge_: v = self.suggested_merge_[d.dim_param] @@ -478,7 +480,7 @@ def _compute_matmul_shape(self, node, output_dtype=None): # record inconsistent reduce dim as suggested merge if lhs_shape[lhs_reduce_dim] != rhs_shape[rhs_reduce_dim]: merge_dims = [lhs_shape[lhs_reduce_dim], rhs_shape[rhs_reduce_dim]] - self._add_suggested_merge(merge_dims) + self._add_suggested_merge(merge_dims, apply=True) if output_dtype is None: # infer output_dtype from input type when not specified output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type @@ -954,7 +956,7 @@ def _infer_ZipMap(self, node): def _infer_impl(self, in_mp): self.sympy_data_ = {} self.out_mp_.graph.ClearField('value_info') - self._apply_suggested_merge_to_graph_input() + self._apply_suggested_merge(graph_input_only=True) input_symbols = set() for i in self.out_mp_.graph.input: input_symbols.update([d for d in get_shape_from_type_proto(i.type) if type(d) == str]) From 6b6f61eb1e81d8f7a6f9a47984788869a2d3736b Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Tue, 12 Nov 2019 12:01:28 -0800 Subject: [PATCH 02/11] Remove some unnecessary changes --- cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 4a8a4c3f7e189..52f0d60458ef9 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -188,7 +188,7 @@ if (MSVC) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4100") if (NOT onnxruntime_USE_CUDA) SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL") - #SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") + SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") endif() check_cxx_compiler_flag(-Qspectre HAS_QSPECTRE) if (HAS_QSPECTRE) From d699e29f062d722afe5e60b4f3dc1c22c861de70 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Tue, 12 Nov 2019 16:32:25 -0800 Subject: [PATCH 03/11] Address CR and update test --- .../x86/op_ir_creator/math/binary_ops.cc | 2 +- .../python/onnxruntime_test_python_nuphar.py | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc index 2e5ed9b2551de..9af9a37326baf 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc @@ -28,7 +28,7 @@ bool HandleConstantScalar(tvm::Expr& scalar, size_t i, const Node& node, CodeGen return false; // return if not constant or not scalar #define ASSIGN_TVM_SCALAR(tvm_type, tensor_type) \ - if (utils::IsPrimitiveDataType(tensor->DataType())) { \ + if (tensor->IsDataType()) { \ scalar = tvm::make_const(tvm_type, *tensor->Data()); \ } diff --git a/onnxruntime/test/python/onnxruntime_test_python_nuphar.py 
b/onnxruntime/test/python/onnxruntime_test_python_nuphar.py index 46415e5399962..a72050d70f80c 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_nuphar.py +++ b/onnxruntime/test/python/onnxruntime_test_python_nuphar.py @@ -90,6 +90,34 @@ def test_bidaf(self): sess.run([], feed) + def test_bert_squad(self): + # download BERT_squad model + cwd = os.getcwd() + bert_squad_url = 'https://onnxzoo.blob.core.windows.net/models/opset_10/bert_squad/download_sample_10.tar.gz' + cache_dir = os.path.join(os.path.expanduser("~"), '.cache','onnxruntime') + os.makedirs(cache_dir, exist_ok=True) + bert_squad_local = os.path.join(cache_dir, 'bert_squad.tar.gz') + if not os.path.exists(bert_squad_local): + urllib.request.urlretrieve(bert_squad_url, bert_squad_local) + with tarfile.open(bert_squad_local, 'r') as f: + f.extractall(cwd) + + # run symbolic shape inference on this model + # set int_max to 1,000,000 to simplify symbol computes for things like min(1000000, seq_len) -> seq_len + bert_squad_dir = os.path.join(cwd, 'download_sample_10') + bert_squad_model = os.path.join(bert_squad_dir, 'bertsquad10.onnx') + subprocess.run([sys.executable, '-m', 'onnxruntime.nuphar.symbolic_shape_infer', '--input', bert_squad_model, '--output', bert_squad_model, '--auto_merge', '--int_max=1000000'], check=True, cwd=cwd) + + # run onnx_test_runner to verify results + onnx_test_runner = os.path.join(cwd, 'onnx_test_runner') + subprocess.run([onnx_test_runner, '-e', 'nuphar', '-n', 'download_sample_10', cwd], check=True, cwd=cwd) + + # run onnxruntime_perf_test + onnx_test_runner = os.path.join(cwd, 'onnxruntime_perf_test') + subprocess.run([onnx_test_runner, '-e', 'nuphar', '-t', '20', bert_squad_model, '1.txt'], check=True, cwd=cwd) + subprocess.run([onnx_test_runner, '-e', 'cpu', '-o', '99', '-t', '20', bert_squad_model, '1.txt'], check=True, cwd=cwd) + + def test_rnn_benchmark(self): # make sure benchmarking scripts works # note: quantized model requires AVX2, otherwise it might be slow @@ -106,5 +134,6 @@ def test_rnn_benchmark(self): layers=3, seq_len=16, batch_size=2, min_duration_seconds=1) + if __name__ == '__main__': unittest.main() From e64e4b632deb154ae3376b82abcf6afd659e24f3 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Tue, 12 Nov 2019 23:26:09 -0800 Subject: [PATCH 04/11] Vectorize ReduceMean A better scalar initializer handling --- .../nuphar/common/nuphar_tvm_utils.cc | 33 ++++++ .../nuphar/common/nuphar_tvm_utils.h | 6 +- .../nuphar/compiler/codegen_manager.cc | 8 -- .../nuphar/compiler/nuphar_codegen_ctx.h | 11 ++ .../nuphar/compiler/nuphar_op_ir_builder.cc | 34 ++++++ .../compiler/nuphar_schedule_builder.cc | 2 + .../compiler/x86/op_ir_creator/all_ops.h | 19 +--- .../x86/op_ir_creator/math/binary_ops.cc | 105 ------------------ .../compiler/x86/op_ir_creator/math/matmul.cc | 2 +- .../compiler/x86/op_ir_creator/math/pow.cc | 46 ++++++++ .../nuphar/mti_x86/math/reduce_ops.cc | 17 +++ .../nuphar/mti_x86/math/reduce_ops.h | 7 ++ .../nuphar/partition/subgraph_partitioner.cc | 7 +- 13 files changed, 159 insertions(+), 138 deletions(-) delete mode 100644 onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc create mode 100644 onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/pow.cc diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc index 90d3c6018dca1..7ea0407c73d5c 100644 --- a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc +++ 
b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc @@ -170,5 +170,38 @@ std::string GetPackedFuncName(const nuphar::NupharSubgraphUnit& subgraph, const return NormalizeCppName("_" + subgraph.UniqueId() + " " + codegen_target.GetTargetName()); } +bool TryCreateConstantScalar( + tvm::Expr& scalar, + const Tensor* tensor) { + if (!tensor || tensor->Shape().Size() > 1) + return false; // return if not constant or not scalar + +#define ASSIGN_TVM_SCALAR(tvm_type, tensor_type) \ + if (tensor->IsDataType()) { \ + scalar = tvm::make_const(tvm_type, *tensor->Data()); \ + } + +#define ASSIGN_TVM_SCALAR_ELSE(tvm_type, tensor_type) \ + else ASSIGN_TVM_SCALAR(tvm_type, tensor_type) + + ASSIGN_TVM_SCALAR(HalideIR::Float(32), float) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Float(64), double) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(64), int64_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(32), int32_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(16), int16_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(8), int8_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(64), uint64_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(32), uint32_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(16), uint16_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(8), uint8_t) + else { + return false; + } + +#undef ASSIGN_TVM_SCALAR + + return true; +} + } // namespace nuphar } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.h b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.h index 3c26a0c6f61f9..614e1ac542553 100644 --- a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.h +++ b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.h @@ -8,7 +8,10 @@ #include "core/graph/graph.h" namespace onnxruntime { -class CodeGenTarget; //forward + +//forward +class CodeGenTarget; +class Tensor; namespace nuphar { @@ -22,5 +25,6 @@ void SaveTVMModuleToCache(const std::string& filename, tvm::runtime::Module& mod std::string GetPackedFuncName(const nuphar::NupharSubgraphUnit& subgraph, const CodeGenTarget& codegen_target); +bool TryCreateConstantScalar(tvm::Expr& scalar, const Tensor* tensor); } // namespace nuphar } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc index 3879bda9fe66e..582ada8f3b944 100644 --- a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc +++ b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc @@ -30,8 +30,6 @@ namespace nuphar { #define POOL_OP(OP) ADD_OP_ITEM(OP) #define REDUCE_V_OP(name) ADD_OP_ITEM(name) #define UNARY_OP(name) ADD_OP_ITEM(name) -#define BINARY_OP(name) ADD_OP_ITEM(name) -#define BINARY_CMP_OP(name) ADD_OP_ITEM(name) static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_registry) { LIST_ALL_X86_OPS() @@ -41,8 +39,6 @@ static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_re #undef POOL_OP #undef REDUCE_V_OP #undef UNARY_OP -#undef BINARY_OP -#undef BINARY_CMP_OP // END: NupharTVM X86 IR creator classes @@ -142,8 +138,6 @@ static void RegisterAllNupharWeightLayouts(tvm_codegen::WeightLayoutRegistry* la #define POOL_OP(OP) ADD_OP_ITEM(OP) #define REDUCE_V_OP(name) ADD_OP_ITEM(name) #define UNARY_OP(name) ADD_OP_ITEM(name) -#define BINARY_OP(name) ADD_OP_ITEM(name) -#define BINARY_CMP_OP(name) ADD_OP_ITEM(name) static void RegisterNupharX86Dispatcher(const std::shared_ptr& builder, const tvm_codegen::OpIRRegistry* registry) { @@ -156,8 +150,6 @@ static void 
RegisterNupharX86Dispatcher(const std::shared_ptr #include namespace onnxruntime { @@ -121,7 +122,17 @@ class NupharCodeGenCtx : public tvm_codegen::CodeGenContext { return tvm_tensor_ctx_; } + void InsertLiteral(const std::string& str) { + literalized_scalars_.insert(str); + } + + bool CheckLiteral(const std::string& str) { + return literalized_scalars_.count(str) > 0; + } + private: + std::set literalized_scalars_; + std::unique_ptr graph_stats_; const NupharCodeGenHandle* nuphar_handle_; diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc b/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc index 6c7567e1ddca7..e5932d32a0809 100644 --- a/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc +++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc @@ -8,6 +8,7 @@ #include "core/codegen/passes/op_ir_creator/tvm_ir_builder.h" #include "core/codegen/passes/utils/ort_tvm_utils.h" #include "core/common/common.h" +#include "core/providers/nuphar/common/nuphar_tvm_utils.h" #include "core/providers/nuphar/compiler/initializer_info.h" #include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" @@ -28,6 +29,10 @@ static const tvm::Tensor& GetOrCreateInitializer(const NodeArg* def, bool is_sliced, NupharCodeGenCtx& ctx_codegen); +static bool CreateScalarTensorFromInitializer(const Tensor* tensor, + const std::string& name, + NupharCodeGenCtx& ctx_codegen); + // CreateInputPlaceholder create tvm input placeholder (tvm::Tensor) // NOTE: here we assume axis 0 is sequence // TODO: add support for sequence not axis 0 @@ -51,6 +56,12 @@ static bool CreateInput( return false; ORT_ENFORCE(def->Shape()); + + if (nullptr != initialized_tensor && + CreateScalarTensorFromInitializer(initialized_tensor, def->Name(), ctx_codegen)) { + return false; // constant scalar tensor do not need to be in input + } + if (nullptr != initialized_tensor) { input = GetOrCreateInitializer(def, initialized_tensor, is_sliced, ctx_codegen); } else { @@ -68,6 +79,29 @@ static bool CreateInput( return true; } +bool CreateScalarTensorFromInitializer(const Tensor* tensor, + const std::string& name, + NupharCodeGenCtx& ctx_codegen) { + TVMTensorCtx& ctx_tensor = ctx_codegen.GetTVMTensorCtx(); + ORT_ENFORCE(tensor != nullptr); + + tvm::Expr constant_scalar; + if (!TryCreateConstantScalar(constant_scalar, tensor)) + return false; + + std::string normalized_name = NormalizeCppName(name); + auto tvm_tensor = tvm::compute( + tvm_codegen::ToTvmArray(tensor->Shape().GetDims()), + [&](const tvm::Array&) { + return constant_scalar; + }, + normalized_name); + + ctx_codegen.InsertLiteral(normalized_name); + ctx_tensor.inputs.emplace(name, std::move(tvm_tensor)); + return true; +} + // GetOrCreateInitializer create tvm::placeholder for a marshalled weight // with correpsonding data layout transfomration for a weight, // Note the weight is fed during build diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc index 2755f0c01aed1..57432e2b615e4 100644 --- a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc +++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc @@ -47,6 +47,8 @@ static void Traverse(const tvm::Tensor& tensor, if (t->op->InputTensors().size() > 0) { auto current_node = ctx_codegen.FindNode(t); Traverse(t, current_node, ctx_codegen, ctx_schedule); + } else if (ctx_codegen.CheckLiteral(t->op->name)) { + 
TryInlineSchedule(t, ctx_schedule); } } } diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h index 5a37ac21a9767..c06961633f9e5 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h @@ -38,18 +38,6 @@ namespace nuphar { UNARY_OP(Softplus) \ UNARY_OP(Tanh) -#define LIST_X86_BINARY_OPS() \ - BINARY_OP(Add) \ - BINARY_OP(Div) \ - BINARY_OP(Mul) \ - BINARY_OP(Pow) \ - BINARY_OP(Sub) - -#define LIST_X86_BINARY_CMP_OPS() \ - BINARY_CMP_OP(Equal) \ - BINARY_CMP_OP(Greater) \ - BINARY_CMP_OP(Less) - #define LIST_REDUCE_V_OPS() \ REDUCE_V_OP(ReduceMax) \ REDUCE_V_OP(ReduceMin) \ @@ -58,14 +46,13 @@ namespace nuphar { #define LIST_ALL_X86_OPS() \ LIST_REDUCE_V_OPS() \ LIST_X86_POOL_OPS() \ - LIST_X86_BINARY_OPS() \ - LIST_X86_BINARY_CMP_OPS() \ LIST_X86_UNARY_OPS() \ ADD_OP_ITEM(Gemm) \ ADD_OP_ITEM(LogSoftmax) \ ADD_OP_ITEM(MatMul) \ ADD_OP_ITEM(MatMulInteger) \ ADD_OP_ITEM(MatMulInteger16) \ + ADD_OP_ITEM(Pow) \ ADD_OP_ITEM(Scatter) \ ADD_OP_ITEM(ScatterElements) \ ADD_OP_ITEM(Slice) \ @@ -76,8 +63,6 @@ namespace nuphar { #define ADD_OP_ITEM(OP) DECLARE_NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(OP) #define POOL_OP(OP) ADD_OP_ITEM(OP) #define REDUCE_V_OP(OP) ADD_OP_ITEM(OP) -#define BINARY_OP(OP) ADD_OP_ITEM(OP) -#define BINARY_CMP_OP(OP) ADD_OP_ITEM(OP) #define UNARY_OP(OP) ADD_OP_ITEM(OP) LIST_ALL_X86_OPS() @@ -85,8 +70,6 @@ LIST_ALL_X86_OPS() #undef ADD_OP_ITEM #undef REDUCE_V_OP #undef POOL_OP -#undef BINARY_OP -#undef BINARY_CMP_OP #undef UNARY_OP } // namespace nuphar diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc deleted file mode 100644 index 9af9a37326baf..0000000000000 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" - -#include "core/codegen/common/op_macro.h" -#include "core/codegen/mti/math/binary_ops.h" -#include "core/codegen/mti/tensor/cast_ops.h" -#include "core/framework/op_kernel_info.h" -#include "core/providers/common.h" -#include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h" -#include "core/providers/nuphar/mti_x86/math/pow.h" - -namespace onnxruntime { -using namespace tvm_codegen; - -namespace nuphar { - -bool HandleConstantScalar(tvm::Expr& scalar, size_t i, const Node& node, CodeGenContext& ctx_codegen) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - NupharCodeGenCtx* ctx_nuphar = Promote(&ctx_codegen); - - ORT_ENFORCE(i < node.InputDefs().size()); - const auto* tensor = ctx_nuphar->GetOrtInitializerTensor(node.InputDefs()[i]->Name()); - - if (!tensor || tensor->Shape().Size() > 1) - return false; // return if not constant or not scalar - -#define ASSIGN_TVM_SCALAR(tvm_type, tensor_type) \ - if (tensor->IsDataType()) { \ - scalar = tvm::make_const(tvm_type, *tensor->Data()); \ - } - -#define ASSIGN_TVM_SCALAR_ELSE(tvm_type, tensor_type) \ - else ASSIGN_TVM_SCALAR(tvm_type, tensor_type) - - ASSIGN_TVM_SCALAR(HalideIR::Float(32), float) - ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(64), int64_t) - ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(32), int32_t) - ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(64), uint64_t) - ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(32), uint32_t) - ASSIGN_TVM_SCALAR_ELSE(HalideIR::Float(64), double) - else { - return false; - } - -#undef ASSIGN_TVM_SCALAR - - return true; -} - -// helper local macro defines Evaluate of BINARY_OP OpIRCreators -#define BINARY_OP(name) \ - Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext& ctx_codegen, \ - tvm::Array& outputs) { \ - tvm::Expr scalar0, scalar1; \ - bool use_scalar0 = HandleConstantScalar(scalar0, 0, node, ctx_codegen); \ - bool use_scalar1 = HandleConstantScalar(scalar1, 1, node, ctx_codegen); \ - tvm::Tensor Y; \ - if (use_scalar0) \ - Y = name(scalar0, inputs[1], node.Name()); \ - else if (use_scalar1) \ - Y = name(inputs[0], scalar1, node.Name()); \ - else \ - Y = name(inputs[0], inputs[1], node.Name()); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -LIST_X86_BINARY_OPS() - -#undef BINARY_OP - -// helper local macro defines Evaluate of BINARY_CMP_OP OpIRCreators -#define BINARY_CMP_OP(name) \ - Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext& ctx_codegen, \ - tvm::Array& outputs) { \ - tvm::Expr scalar0, scalar1; \ - bool use_scalar0 = HandleConstantScalar(scalar0, 0, node, ctx_codegen); \ - bool use_scalar1 = HandleConstantScalar(scalar1, 1, node, ctx_codegen); \ - tvm::Tensor Y; \ - if (use_scalar0) \ - Y = name(scalar0, inputs[1], node.Name()); \ - else if (use_scalar1) \ - Y = name(inputs[0], scalar1, node.Name()); \ - else \ - Y = name(inputs[0], inputs[1], node.Name()); \ - Y = Cast(Y, HalideIR::UInt(8), "cast_bool_" #name); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -LIST_X86_BINARY_CMP_OPS() - -#undef BINARY_CMP_OP - -} // namespace nuphar -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc index f766fe98ad7c1..9351aa996aab4 100644 --- 
a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc @@ -134,7 +134,7 @@ static bool MatMulF32ExternCPU( const std::vector* p_permute_B = nullptr; tvm::Tensor root_A = find_transposed_input(A, permute_A); tvm::Tensor root_B = find_transposed_input(B, permute_B); - if (A->shape.size() == B->shape.size() && A->shape.size() > 2) { + if (A->shape.size() == B->shape.size() && A->shape.size() >= 2) { // currently only fuse Transpose into MatMul when rank(A) == rank(B) // make sure no broadcasting in MatMul bool no_broadcast = true; diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/pow.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/pow.cc new file mode 100644 index 0000000000000..7b1d04d1a944c --- /dev/null +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/pow.cc @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/framework/op_kernel_info.h" +#include "core/providers/nuphar/common/nuphar_tvm_utils.h" +#include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h" +#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" +#include "core/providers/nuphar/mti_x86/math/pow.h" + +namespace onnxruntime { +namespace nuphar { + +Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(Pow)::Evaluate( + const tvm::Array& inputs, + const Node& node, + tvm_codegen::CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + ORT_ENFORCE(inputs.size() == 2); + + struct { + tvm::Expr expr; + bool is_scalar; + } constant_scalars[2]; + + for (size_t i = 0; i < 2; ++i) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + NupharCodeGenCtx* ctx_nuphar = Promote(&ctx_codegen); + + ORT_ENFORCE(i < node.InputDefs().size()); + const auto* tensor = ctx_nuphar->GetOrtInitializerTensor(node.InputDefs()[i]->Name()); + constant_scalars[i].is_scalar = TryCreateConstantScalar(constant_scalars[i].expr, tensor); + } + tvm::Tensor Y; + if (constant_scalars[0].is_scalar) + Y = Pow(constant_scalars[0].expr, inputs[1], node.Name()); + else if (constant_scalars[1].is_scalar) + Y = Pow(inputs[0], constant_scalars[1].expr, node.Name()); + else + Y = Pow(inputs[0], inputs[1], node.Name()); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace nuphar +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc index d78d2473d81e2..3e193517a71cf 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc @@ -305,6 +305,23 @@ tvm::Tensor ReduceMin(const tvm::Tensor& X, X->dtype.max(), vector_size, last_dim_aligned, fuse_dim, name); } +tvm::Tensor ReduceMean(const tvm::Tensor& X, + const std::vector& axes, bool keep_dims, + const int32_t vector_size, + bool last_dim_aligned, + int32_t fuse_dim, + const std::string& name) { + tvm::Tensor sum = ReduceValue(X, tvm::sum, axes, keep_dims, + X->dtype.max(), vector_size, last_dim_aligned, fuse_dim, name); + + tvm::Expr count = tvm::make_const(HalideIR::Int(32), 1); + for (auto ax : axes) { + ax = tvm_codegen::HandleNegativeAxis(ax, X->shape.size()); + count = count * X->shape[ax]; + } + return topi::divide(sum, tvm::cast(X->dtype, count)); +} + // [WIP] a special vectorization friendly value reduction // Keep_dim always true 
tvm::Tensor ReduceValueLowest_noPad(const tvm::Tensor& X, diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.h b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.h index 4f15255df1ab5..eca60c2eb1cbe 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.h +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.h @@ -36,5 +36,12 @@ tvm::Tensor ReduceMin(const tvm::Tensor& X, int32_t fuse_dim = 0, const std::string& name = "reduce_min_v"); +tvm::Tensor ReduceMean(const tvm::Tensor& X, + const std::vector& axes, bool keep_dims, + const int32_t vector_size, + bool last_dim_aligned = false, + int32_t fuse_dim = 0, + const std::string& name = "reduce_mean_v"); + } // namespace nuphar } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc b/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc index 7ffe9322bc2d5..29c5339f6ebef 100644 --- a/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc +++ b/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc @@ -189,12 +189,9 @@ Status SubgraphPartitioner::Partition( bool unused_initializer = false; if (t != nullptr) { // note for Reshape and Tile, shape/repeats as initializer is not used at runtime - // scalar initializers in binary ops are not used at runtime either - static const std::unordered_set binary_ops = - {"Add", "Div", "Sub", "Mul", "Pow", "Equal", "Greater", "Less"}; - + // neither for any scalar unused_initializer = ((node.OpType() == "Reshape" || node.OpType() == "Tile") && i == 1) || - (binary_ops.count(node.OpType()) > 0 && t->Shape().Size() == 1); + t->Shape().Size() == 1; if (!unused_initializer) { subgraph.initializers.emplace(def.Name(), t); From a4d8a394c8305e5705381e7c8fc5ed4b73d96329 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 00:01:41 -0800 Subject: [PATCH 05/11] Missing reduce mean --- cmake/CMakeLists.txt | 2 +- .../core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 52f0d60458ef9..4a8a4c3f7e189 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -188,7 +188,7 @@ if (MSVC) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4100") if (NOT onnxruntime_USE_CUDA) SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL") - SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") + #SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") endif() check_cxx_compiler_flag(-Qspectre HAS_QSPECTRE) if (HAS_QSPECTRE) diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h index c06961633f9e5..84868996cf8d9 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h @@ -41,7 +41,8 @@ namespace nuphar { #define LIST_REDUCE_V_OPS() \ REDUCE_V_OP(ReduceMax) \ REDUCE_V_OP(ReduceMin) \ - REDUCE_V_OP(ReduceSum) + REDUCE_V_OP(ReduceSum) \ + REDUCE_V_OP(ReduceMean) #define LIST_ALL_X86_OPS() \ LIST_REDUCE_V_OPS() \ From 2f3f06cae57890c654d6ce7dfb77092aaa719989 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 00:11:28 -0800 Subject: [PATCH 06/11] undo unintended change --- cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt 
b/cmake/CMakeLists.txt index 4a8a4c3f7e189..52f0d60458ef9 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -188,7 +188,7 @@ if (MSVC) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4100") if (NOT onnxruntime_USE_CUDA) SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL") - #SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") + SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") endif() check_cxx_compiler_flag(-Qspectre HAS_QSPECTRE) if (HAS_QSPECTRE) From 8f9fad5289cabd2b48d0f26acf71ecea44816f8c Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 15:23:36 -0800 Subject: [PATCH 07/11] Treat constant initializers that has same value for all elements as scalar --- .../nuphar/common/nuphar_tvm_utils.cc | 37 ++++++++++++++++++- .../nuphar/mti_x86/math/matmul_ops.cc | 4 +- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc index 7ea0407c73d5c..7f3e2dda0d0e4 100644 --- a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc +++ b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc @@ -173,8 +173,41 @@ std::string GetPackedFuncName(const nuphar::NupharSubgraphUnit& subgraph, const bool TryCreateConstantScalar( tvm::Expr& scalar, const Tensor* tensor) { - if (!tensor || tensor->Shape().Size() > 1) - return false; // return if not constant or not scalar + if (!tensor) + return false; + + auto num_elements = tensor->Shape().Size(); + if (num_elements > 1) { + // for non-scalar, only fold to constant scalar when all values are identical + const auto& dtype = tensor->DataType(); + auto elem_size = dtype->Size(); + const void* data = tensor->DataRaw(); + +#define CHECK_ALL_TENSOR_SAME(T) \ + for (int64_t i = 1; i < num_elements; ++i) { \ + if (reinterpret_cast(data)[i] != reinterpret_cast(data)[0]) \ + return false; \ + } + + switch (elem_size) { + case 1: + CHECK_ALL_TENSOR_SAME(int8_t); + break; + case 2: + CHECK_ALL_TENSOR_SAME(int16_t); + break; + case 4: + CHECK_ALL_TENSOR_SAME(int32_t); + break; + case 8: + CHECK_ALL_TENSOR_SAME(int64_t); + break; + default: + return false; + } + +#undef CHECK_ALL_TENSOR_SAME + } #define ASSIGN_TVM_SCALAR(tvm_type, tensor_type) \ if (tensor->IsDataType()) { \ diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc index 141f70c572505..8fbbc60d72564 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc @@ -157,9 +157,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.batched_matmul_cpu") int64_t N = B->shape[permute_B[B->ndim - 1]]; bool trans_a = (permute_A[A->ndim - 2] == A->ndim - 1); bool trans_b = (permute_B[B->ndim - 2] == B->ndim - 1); - int64_t step_a = stride_A[permute_A[A->ndim - 3]]; + int64_t step_a = num_matmuls > 1 ? stride_A[permute_A[A->ndim - 3]] : 0; int64_t lda = stride_A[permute_A[A->ndim - (trans_a ? 1 : 2)]]; - int64_t step_b = stride_B[permute_B[B->ndim - 3]]; + int64_t step_b = num_matmuls > 1 ? stride_B[permute_B[B->ndim - 3]] : 0; int64_t ldb = stride_B[permute_B[B->ndim - (trans_b ? 
1 : 2)]]; for (int i = 0; i < num_matmuls; i++) { From f4345bf83b996dc67b35d417221acfcf58ca896b Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 16:24:34 -0800 Subject: [PATCH 08/11] Fix reduce test --- .../providers/nuphar/mti_x86/math/reduce_ops.cc | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc index 3e193517a71cf..df35a42d318f0 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc @@ -314,10 +314,16 @@ tvm::Tensor ReduceMean(const tvm::Tensor& X, tvm::Tensor sum = ReduceValue(X, tvm::sum, axes, keep_dims, X->dtype.max(), vector_size, last_dim_aligned, fuse_dim, name); - tvm::Expr count = tvm::make_const(HalideIR::Int(32), 1); - for (auto ax : axes) { - ax = tvm_codegen::HandleNegativeAxis(ax, X->shape.size()); - count = count * X->shape[ax]; + tvm::Expr count; + if (axes.size() > 0) { + count = tvm::make_const(HalideIR::Int(32), 1); + for (auto ax : axes) { + ax = tvm_codegen::HandleNegativeAxis(ax, X->shape.size()); + count = count * X->shape[ax]; + } + } else { + // by default, reduce over all axes + count = tvm_codegen::SizeFromDimension(X->shape, 0); } return topi::divide(sum, tvm::cast(X->dtype, count)); } From 3836a2a8b9686a5358e9a5b729e4450eb0cfd74c Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 17:03:56 -0800 Subject: [PATCH 09/11] Fix reduce mean tests --- .../core/providers/nuphar/mti_x86/math/reduce_ops.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc index df35a42d318f0..e816c77ed90ab 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc @@ -311,9 +311,7 @@ tvm::Tensor ReduceMean(const tvm::Tensor& X, bool last_dim_aligned, int32_t fuse_dim, const std::string& name) { - tvm::Tensor sum = ReduceValue(X, tvm::sum, axes, keep_dims, - X->dtype.max(), vector_size, last_dim_aligned, fuse_dim, name); - + tvm::Tensor sum = ReduceSum(X, axes, keep_dims, vector_size, last_dim_aligned, fuse_dim, name + "_sum"); tvm::Expr count; if (axes.size() > 0) { count = tvm::make_const(HalideIR::Int(32), 1); @@ -325,7 +323,7 @@ tvm::Tensor ReduceMean(const tvm::Tensor& X, // by default, reduce over all axes count = tvm_codegen::SizeFromDimension(X->shape, 0); } - return topi::divide(sum, tvm::cast(X->dtype, count)); + return topi::divide(sum, tvm::cast(X->dtype, count), name + "_div"); } // [WIP] a special vectorization friendly value reduction From 5d8730f54b1388828f87ff771d203e306a9080d2 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 22:42:15 -0800 Subject: [PATCH 10/11] Improve symbolic shape inference --- .../nuphar/scripts/symbolic_shape_infer.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py index b59bc9a9411df..48becfede31c8 100644 --- a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py +++ b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py @@ -48,6 +48,14 @@ def as_scalar(x): else: return x +def as_list(x): + if type(x) == list: + return x + elif type(x) == np.ndarray: + 
return list(x) + else: + return [x] + def sympy_reduce_product(x): if type(x) == list: value = sympy.Integer(1) @@ -809,14 +817,16 @@ def _infer_Slice(self, node): ends = get_attribute(node, 'ends') steps = [1]*len(axes) else: - starts = self._try_get_value(node, 1) - ends = self._try_get_value(node, 2) + starts = as_list(self._try_get_value(node, 1)) + ends = as_list(self._try_get_value(node, 2)) axes = self._try_get_value(node, 3) steps = self._try_get_value(node, 4) if axes is None and not (starts is None and ends is None): axes = list(range(0, len(starts if starts is not None else ends))) if steps is None and not (starts is None and ends is None): steps = [1]*len(starts if starts is not None else ends) + axes = as_list(axes) + steps = as_list(steps) new_sympy_shape = self._get_sympy_shape(node, 0) if starts is None or ends is None: From 3b1e5407e64494e6d4df5ce93a1072de75915943 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Thu, 14 Nov 2019 00:34:58 -0800 Subject: [PATCH 11/11] Minor updates for better debugging --- onnxruntime/core/providers/nuphar/common/nuphar_subgraph.h | 4 ---- onnxruntime/core/providers/nuphar/runtime/exec_block.cc | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_subgraph.h b/onnxruntime/core/providers/nuphar/common/nuphar_subgraph.h index 06e105150ad54..52f950a648461 100644 --- a/onnxruntime/core/providers/nuphar/common/nuphar_subgraph.h +++ b/onnxruntime/core/providers/nuphar/common/nuphar_subgraph.h @@ -83,10 +83,6 @@ struct NupharSubgraphUnit { return nodes.size() == 1; } - const std::string& Name() const { - return nodes.front()->Name(); - } - std::string UniqueId() const { return std::to_string(id_); } diff --git a/onnxruntime/core/providers/nuphar/runtime/exec_block.cc b/onnxruntime/core/providers/nuphar/runtime/exec_block.cc index 26feec129758e..6aed253ab7d43 100644 --- a/onnxruntime/core/providers/nuphar/runtime/exec_block.cc +++ b/onnxruntime/core/providers/nuphar/runtime/exec_block.cc @@ -18,10 +18,10 @@ void CreateExecBlock(std::vector>& exec_blocks, bool /*enable_tiling*/) { if (subgraph.IsSingleNode() && subgraph.nodes.front()->OpType() == "Scan") { exec_blocks.push_back( - std::move(onnxruntime::make_unique(func_info, "nuphar_exec_" + subgraph.Name()))); + std::move(onnxruntime::make_unique(func_info, "nuphar_exec_" + subgraph.UniqueId()))); } else { exec_blocks.push_back( - std::move(onnxruntime::make_unique(func_info, "nuphar_exec_" + subgraph.Name()))); + std::move(onnxruntime::make_unique(func_info, "nuphar_exec_" + subgraph.UniqueId()))); } }
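
Editor's note — illustrative sketch only, not part of the patch series. Patch 07 ("Treat constant initializers that has same value for all elements as scalar") extends TryCreateConstantScalar in nuphar_tvm_utils.cc so that a constant initializer whose elements are all bit-identical is folded to a single scalar expression. The standalone C++ sketch below reproduces just that "all elements identical" check outside of onnxruntime; the helper name AllElementsIdentical, the use of std::memcmp, and the main() driver are assumptions introduced for this example and do not appear in the patch.

// Minimal sketch of the identical-element check behind constant-scalar folding.
// Assumes a raw byte buffer plus element count/size instead of onnxruntime::Tensor.
#include <cstdint>
#include <cstring>
#include <cassert>

// Returns true when every element in the buffer is byte-for-byte equal to the
// first one, comparing elements by their width (1/2/4/8 bytes), which mirrors
// the patch's switch on elem_size with reinterpret_cast comparisons.
static bool AllElementsIdentical(const void* data, int64_t num_elements, size_t elem_size) {
  const uint8_t* bytes = static_cast<const uint8_t*>(data);
  for (int64_t i = 1; i < num_elements; ++i) {
    if (std::memcmp(bytes + i * elem_size, bytes, elem_size) != 0)
      return false;  // at least one element differs: keep the tensor as-is
  }
  return true;  // all elements identical: safe to fold to a scalar constant
}

int main() {
  const float same[4]  = {2.5f, 2.5f, 2.5f, 2.5f};
  const float mixed[4] = {2.5f, 2.5f, 3.0f, 2.5f};
  assert(AllElementsIdentical(same, 4, sizeof(float)));    // foldable to a scalar
  assert(!AllElementsIdentical(mixed, 4, sizeof(float)));  // must stay a full initializer
  return 0;
}

Because the comparison is bitwise (per element width) rather than typed floating-point equality, it matches the behavior of the patch's int8_t/int16_t/int32_t/int64_t reinterpretation; only after this check passes does the patched code go on to build the tvm::make_const scalar for the tensor's data type.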