From 54a47b4aebee66ea1ed999cbfb35372b3254c94a Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Fri, 8 Nov 2019 18:29:57 -0800 Subject: [PATCH 01/11] Fuse transpose into MatMul Implement Pow and constant scalar simplification --- cmake/CMakeLists.txt | 2 +- .../core/codegen/mti/math/matmul_ops.cc | 36 ++- .../core/codegen/mti/math/matmul_ops.h | 4 +- .../passes/op_ir_creator/math/unary_funcs.h | 51 ++++ .../passes/op_ir_creator/math/unary_ops.cc | 45 +--- .../nuphar/compiler/codegen_manager.cc | 8 + .../compiler/x86/op_ir_creator/all_ops.h | 26 +- .../x86/op_ir_creator/math/binary_ops.cc | 105 ++++++++ .../compiler/x86/op_ir_creator/math/gemm.cc | 2 +- .../compiler/x86/op_ir_creator/math/matmul.cc | 107 ++++++-- .../x86/op_ir_creator/math/unary_ops.cc | 50 +--- onnxruntime/core/providers/nuphar/kernel.cc | 2 + onnxruntime/core/providers/nuphar/kernel.h | 205 +++++++-------- .../nuphar/mti_x86/math/halide_ops.cc | 2 +- .../nuphar/mti_x86/math/halide_ops.h | 2 + .../nuphar/mti_x86/math/matmul_ops.cc | 234 ++++++++++++------ .../nuphar/mti_x86/math/matmul_ops.h | 12 +- .../core/providers/nuphar/mti_x86/math/pow.cc | 42 ++++ .../core/providers/nuphar/mti_x86/math/pow.h | 16 ++ .../nuphar/partition/subgraph_partitioner.cc | 7 +- .../nuphar/scripts/symbolic_shape_infer.py | 12 +- 21 files changed, 655 insertions(+), 315 deletions(-) create mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h create mode 100644 onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc create mode 100644 onnxruntime/core/providers/nuphar/mti_x86/math/pow.cc create mode 100644 onnxruntime/core/providers/nuphar/mti_x86/math/pow.h diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 52f0d60458ef9..4a8a4c3f7e189 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -188,7 +188,7 @@ if (MSVC) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4100") if (NOT onnxruntime_USE_CUDA) SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL") - SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") + #SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") endif() check_cxx_compiler_flag(-Qspectre HAS_QSPECTRE) if (HAS_QSPECTRE) diff --git a/onnxruntime/core/codegen/mti/math/matmul_ops.cc b/onnxruntime/core/codegen/mti/math/matmul_ops.cc index 46f2fb75b6e24..1188cd874f7fe 100644 --- a/onnxruntime/core/codegen/mti/math/matmul_ops.cc +++ b/onnxruntime/core/codegen/mti/math/matmul_ops.cc @@ -124,22 +124,36 @@ tvm::Tensor MatMul(const tvm::Tensor& A, const tvm::Tensor& B, const std::string tvm::Array ComputeMatMulShape( const tvm::Array& A_shape, - const tvm::Array& B_shape) { + const tvm::Array& B_shape, + bool trans_a, + bool trans_b) { auto a_rank = A_shape.size(); auto b_rank = B_shape.size(); tvm::Array output_shape; int64_t output_rank = std::max(a_rank, b_rank); - MTI_ASSERT(tvm::ir::Equal(A_shape[a_rank - 1], B_shape[b_rank - 2])); - for (int64_t i = 0; i < output_rank - 2; i++) { - tvm::Expr broadcasted_dim = tvm::make_const(HalideIR::Int(32), 1); - bool broadcasted = - BroadcastDim(A_shape, i, output_rank, broadcasted_dim) && - BroadcastDim(B_shape, i, output_rank, broadcasted_dim); - MTI_ASSERT(broadcasted); - output_shape.push_back(broadcasted_dim); + MTI_ASSERT(a_rank > 0 && b_rank > 0); + if (a_rank == 1 && b_rank == 1) { + MTI_ASSERT(!trans_a && !trans_b); + // reduction, output shape is empty + } else if (a_rank == 1) { + MTI_ASSERT(!trans_a && !trans_b); + output_shape = 
SliceShapeToDimension(B_shape, b_rank - 2);
+    output_shape.push_back(B_shape[b_rank - 1]);
+  } else if (b_rank == 1) {
+    MTI_ASSERT(!trans_a && !trans_b);
+    output_shape = SliceShapeToDimension(A_shape, a_rank - 1);
+  } else {
+    for (int64_t i = 0; i < output_rank - 2; i++) {
+      tvm::Expr broadcasted_dim = tvm::make_const(HalideIR::Int(32), 1);
+      bool broadcasted =
+          BroadcastDim(A_shape, i, output_rank, broadcasted_dim) &&
+          BroadcastDim(B_shape, i, output_rank, broadcasted_dim);
+      MTI_ASSERT(broadcasted);
+      output_shape.push_back(broadcasted_dim);
+    }
+    output_shape.push_back(A_shape[a_rank - (trans_a ? 1 : 2)]);
+    output_shape.push_back(B_shape[b_rank - (trans_b ? 2 : 1)]);
   }
-  output_shape.push_back(A_shape[a_rank - 2]);
-  output_shape.push_back(B_shape[b_rank - 1]);
   return output_shape;
 }
 
diff --git a/onnxruntime/core/codegen/mti/math/matmul_ops.h b/onnxruntime/core/codegen/mti/math/matmul_ops.h
index 7180b4f6d81e5..ab9986132d34a 100644
--- a/onnxruntime/core/codegen/mti/math/matmul_ops.h
+++ b/onnxruntime/core/codegen/mti/math/matmul_ops.h
@@ -11,7 +11,9 @@ namespace tvm_codegen {
 
 tvm::Array<tvm::Expr> ComputeMatMulShape(
     const tvm::Array<tvm::Expr>& A_shape,
-    const tvm::Array<tvm::Expr>& B_shape);
+    const tvm::Array<tvm::Expr>& B_shape,
+    bool trans_a = false,
+    bool trans_b = false);
 
 tvm::Tensor MatMul2D(const tvm::Tensor& A, const tvm::Tensor& B,
                      bool trans_a = false, bool trans_b = false, const std::string& name = "matmul2d");
diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h
new file mode 100644
index 0000000000000..29e6519af0ef1
--- /dev/null
+++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h
@@ -0,0 +1,51 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/framework/op_kernel_info.h"
+
+namespace onnxruntime {
+namespace tvm_codegen {
+// helper class for unary_ops with alpha
+class FuncWithAlpha {
+ public:
+  FuncWithAlpha(const Node& node) {
+    ProtoHelperNodeContext ctx(node);
+    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
+    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
+  }
+
+ protected:
+  float alpha_;
+};
+
+// helper class for unary_ops with alpha and beta
+class FuncWithAlphaBeta {
+ public:
+  FuncWithAlphaBeta(const Node& node) {
+    ProtoHelperNodeContext ctx(node);
+    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
+    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
+    ORT_ENFORCE(attrs.GetAttr("beta", &beta_).IsOK());
+  }
+
+ protected:
+  float alpha_;
+  float beta_;
+};
+
+// helper class for unary_ops with alpha and gamma
+class FuncWithAlphaGamma {
+ public:
+  FuncWithAlphaGamma(const Node& node) {
+    ProtoHelperNodeContext ctx(node);
+    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
+    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
+    ORT_ENFORCE(attrs.GetAttr("gamma", &gamma_).IsOK());
+  }
+
+ protected:
+  float alpha_;
+  float gamma_;
+};
+}  // namespace tvm_codegen
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc
index bd5b89c718435..0407c0a06abf6 100644
--- a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc
+++ b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc
@@ -5,54 +5,11 @@
 
 #include "core/codegen/common/op_macro.h"
 #include "core/codegen/mti/math/unary_ops.h"
-#include "core/framework/op_kernel_info.h"
+#include "core/codegen/passes/op_ir_creator/math/unary_funcs.h"
 
 namespace onnxruntime {
 namespace tvm_codegen {
 
-// helper class for unary_ops with alpha
-class FuncWithAlpha {
- public:
-  FuncWithAlpha(const Node& node) {
-    ProtoHelperNodeContext ctx(node);
-    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
-    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
-  }
-
- protected:
-  float alpha_;
-};
-
-// helper class for unary_ops with alpha and beta
-class FuncWithAlphaBeta {
- public:
-  FuncWithAlphaBeta(const Node& node) {
-    ProtoHelperNodeContext ctx(node);
-    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
-    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
-    ORT_ENFORCE(attrs.GetAttr("beta", &beta_).IsOK());
-  }
-
- protected:
-  float alpha_;
-  float beta_;
-};
-
-// helper class for unary_ops with alpha and gamma
-class FuncWithAlphaGamma {
- public:
-  FuncWithAlphaGamma(const Node& node) {
-    ProtoHelperNodeContext ctx(node);
-    OpNodeProtoHelper<ProtoHelperNodeContext> attrs(&ctx);
-    ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK());
-    ORT_ENFORCE(attrs.GetAttr("gamma", &gamma_).IsOK());
-  }
-
- protected:
-  float alpha_;
-  float gamma_;
-};
-
 // helper macro declares unary_ops helper class without attribute
 #define FuncClass(name) \
   class Func##name {    \
diff --git a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc
index 582ada8f3b944..3879bda9fe66e 100644
--- a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc
@@ -30,6 +30,8 @@ namespace nuphar {
 #define POOL_OP(OP) ADD_OP_ITEM(OP)
 #define REDUCE_V_OP(name) ADD_OP_ITEM(name)
 #define UNARY_OP(name) ADD_OP_ITEM(name)
+#define BINARY_OP(name) ADD_OP_ITEM(name)
+#define BINARY_CMP_OP(name) ADD_OP_ITEM(name)
 
 static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_registry) {
   LIST_ALL_X86_OPS()
@@ -39,6 +41,8 @@
static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_re #undef POOL_OP #undef REDUCE_V_OP #undef UNARY_OP +#undef BINARY_OP +#undef BINARY_CMP_OP // END: NupharTVM X86 IR creator classes @@ -138,6 +142,8 @@ static void RegisterAllNupharWeightLayouts(tvm_codegen::WeightLayoutRegistry* la #define POOL_OP(OP) ADD_OP_ITEM(OP) #define REDUCE_V_OP(name) ADD_OP_ITEM(name) #define UNARY_OP(name) ADD_OP_ITEM(name) +#define BINARY_OP(name) ADD_OP_ITEM(name) +#define BINARY_CMP_OP(name) ADD_OP_ITEM(name) static void RegisterNupharX86Dispatcher(const std::shared_ptr& builder, const tvm_codegen::OpIRRegistry* registry) { @@ -150,6 +156,8 @@ static void RegisterNupharX86Dispatcher(const std::shared_ptr info(&ctx); + NupharCodeGenCtx* ctx_nuphar = Promote(&ctx_codegen); + + ORT_ENFORCE(i < node.InputDefs().size()); + const auto* tensor = ctx_nuphar->GetOrtInitializerTensor(node.InputDefs()[i]->Name()); + + if (!tensor || tensor->Shape().Size() > 1) + return false; // return if not constant or not scalar + +#define ASSIGN_TVM_SCALAR(tvm_type, tensor_type) \ + if (utils::IsPrimitiveDataType(tensor->DataType())) { \ + scalar = tvm::make_const(tvm_type, *tensor->Data()); \ + } + +#define ASSIGN_TVM_SCALAR_ELSE(tvm_type, tensor_type) \ + else ASSIGN_TVM_SCALAR(tvm_type, tensor_type) + + ASSIGN_TVM_SCALAR(HalideIR::Float(32), float) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(64), int64_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(32), int32_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(64), uint64_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(32), uint32_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Float(64), double) + else { + return false; + } + +#undef ASSIGN_TVM_SCALAR + + return true; +} + +// helper local macro defines Evaluate of BINARY_OP OpIRCreators +#define BINARY_OP(name) \ + Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext& ctx_codegen, \ + tvm::Array& outputs) { \ + tvm::Expr scalar0, scalar1; \ + bool use_scalar0 = HandleConstantScalar(scalar0, 0, node, ctx_codegen); \ + bool use_scalar1 = HandleConstantScalar(scalar1, 1, node, ctx_codegen); \ + tvm::Tensor Y; \ + if (use_scalar0) \ + Y = name(scalar0, inputs[1], node.Name()); \ + else if (use_scalar1) \ + Y = name(inputs[0], scalar1, node.Name()); \ + else \ + Y = name(inputs[0], inputs[1], node.Name()); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +LIST_X86_BINARY_OPS() + +#undef BINARY_OP + +// helper local macro defines Evaluate of BINARY_CMP_OP OpIRCreators +#define BINARY_CMP_OP(name) \ + Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext& ctx_codegen, \ + tvm::Array& outputs) { \ + tvm::Expr scalar0, scalar1; \ + bool use_scalar0 = HandleConstantScalar(scalar0, 0, node, ctx_codegen); \ + bool use_scalar1 = HandleConstantScalar(scalar1, 1, node, ctx_codegen); \ + tvm::Tensor Y; \ + if (use_scalar0) \ + Y = name(scalar0, inputs[1], node.Name()); \ + else if (use_scalar1) \ + Y = name(inputs[0], scalar1, node.Name()); \ + else \ + Y = name(inputs[0], inputs[1], node.Name()); \ + Y = Cast(Y, HalideIR::UInt(8), "cast_bool_" #name); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } + +LIST_X86_BINARY_CMP_OPS() + +#undef BINARY_CMP_OP + +} // namespace nuphar +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/gemm.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/gemm.cc index 
5ac2adf738017..a248af400edf9 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/gemm.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/gemm.cc @@ -33,7 +33,7 @@ Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(Gemm)::Evaluate( // use native sgemm for floating point if (A->dtype == HalideIR::Float(32) && B->dtype == HalideIR::Float(32) && - MatMulExternCpu(A, B, Y, !!trans_a, !!trans_b, node.Name() + "_gemm")) { + GemmExternCpu(A, B, Y, !!trans_a, !!trans_b, node.Name() + "_gemm")) { if (beta != 0) { tvm::Tensor beta_bias = (beta == 1) ? C : tvm_codegen::Mul(tvm::make_const(tvm::Float(32), beta), C); Y = tvm_codegen::Add((alpha == 1) ? Y : tvm_codegen::Mul(tvm::make_const(tvm::Float(32), alpha), Y), beta_bias, node.Name() + "_add_bias"); diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc index e81ef497c50a8..f766fe98ad7c1 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc @@ -1,14 +1,14 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" - -#include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h" -#include "core/providers/nuphar/mti_x86/math/matmul_ops.h" #include "core/codegen/mti/mti_tvm_utils.h" #include "core/codegen/passes/weight_layout/transpose_2d.h" #include "core/codegen/passes/weight_layout/vertical_stripes_2d.h" +#include "core/framework/op_kernel_info.h" +#include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h" +#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" #include "core/providers/nuphar/compiler/x86/x86_target_info.h" +#include "core/providers/nuphar/mti_x86/math/matmul_ops.h" #include @@ -89,29 +89,86 @@ static bool MatMul_weights2D( return true; } -static bool MatMulF32ExternCpuEx( - ONNX_NAMESPACE::TensorProto_DataType proto_type, - NupharCodeGenCtx& ctx_nuphar, - const tvm::Tensor& A, - const tvm::Tensor& B, +static bool MatMulF32ExternCPU( + tvm::Tensor A, + tvm::Tensor B, tvm::Tensor& Y, - const std::string& B_initializer_name = "", - bool trans_a = false, - bool trans_b = false, - const std::string& name = "matmul_extern_cpu_ex") { - // transpose weights if not already - tvm::Tensor actual_B = B; - - if (ctx_nuphar.IsInitializer(B_initializer_name) && !trans_b) { - auto layout_key = tvm_codegen::WeightLayoutTranspose2D::GetKey(proto_type); - actual_B = ctx_nuphar.ApplyWeightLayout(layout_key, B_initializer_name, B, true); - trans_b = true; + const Node& node, + tvm_codegen::CodeGenContext& ctx_codegen) { + NupharCodeGenCtx* ctx_nuphar = Promote(&ctx_codegen); + + // try to fuse tranpose in MatMul input with MatMul + auto find_transposed_input = [&ctx_nuphar](const tvm::Tensor& t, std::vector& cumulated_permute) { + tvm::Tensor out = t; + int64_t rank = gsl::narrow(t->shape.size()); + std::vector default_node_perm(rank); + cumulated_permute.resize(rank); + for (int64_t i = 0; i < rank; ++i) { + cumulated_permute[i] = gsl::narrow(i); + default_node_perm[i] = rank - i - 1; + } + for (const Node* root_node = ctx_nuphar->FindNode(out); + root_node != nullptr && root_node->OpType() == "Transpose"; + root_node = ctx_nuphar->FindNode(out)) { + ProtoHelperNodeContext ctx(*root_node); + OpNodeProtoHelper info(&ctx); + auto perm = 
info.GetAttrsOrDefault("perm", default_node_perm); + std::vector updated_cumulated_permute = cumulated_permute; + for (int64_t dst_dim = 0; dst_dim < rank; ++dst_dim) { + auto src_dim = tvm_codegen::HandleNegativeAxis(perm[cumulated_permute[dst_dim]], rank); + updated_cumulated_permute[dst_dim] = gsl::narrow(src_dim); + } + cumulated_permute = updated_cumulated_permute; + // op corresponding to node should be Transpose + auto op = out->op.as(); + ORT_ENFORCE(op != nullptr); + ORT_ENFORCE(op->InputTensors().size() == 1); + out = op->InputTensors()[0]; + } + return out; + }; + + std::vector permute_A; + std::vector permute_B; + const std::vector* p_permute_A = nullptr; + const std::vector* p_permute_B = nullptr; + tvm::Tensor root_A = find_transposed_input(A, permute_A); + tvm::Tensor root_B = find_transposed_input(B, permute_B); + if (A->shape.size() == B->shape.size() && A->shape.size() > 2) { + // currently only fuse Transpose into MatMul when rank(A) == rank(B) + // make sure no broadcasting in MatMul + bool no_broadcast = true; + for (size_t i = 0; i < A->shape.size() - 2; ++i) { + if (!tvm::ir::Equal(A->shape[i], B->shape[i])) { + no_broadcast = false; + break; + } + } + if (no_broadcast) { + if (CanPermuteBeFusedInMatMul(permute_A)) { + A = root_A; + p_permute_A = &permute_A; + } + if (CanPermuteBeFusedInMatMul(permute_B)) { + B = root_B; + p_permute_B = &permute_B; + } + } } - return nuphar::MatMulExternCpu(A, actual_B, Y, trans_a, trans_b, name); + const auto& B_name = node.InputDefs()[1]->Name(); + if (ctx_nuphar->IsInitializer(B_name) && B->shape.size() == 2) { + // matmul with initializer, using transpose weights + auto layout_key = tvm_codegen::WeightLayoutTranspose2D::GetKey(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + auto actual_B = ctx_nuphar->ApplyWeightLayout(layout_key, B_name, B, true); + return nuphar::GemmExternCpu(A, actual_B, Y, false, true, B_name); + } else { + return nuphar::MatMulExternCpu(A, B, Y, p_permute_A, p_permute_B, node.Name() + "_matmul_extern"); + } } -Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(MatMul)::Evaluate( +Status +NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(MatMul)::Evaluate( const tvm::Array& inputs, const Node& node, tvm_codegen::CodeGenContext& ctx_codegen, @@ -123,15 +180,17 @@ Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(MatMul)::Evaluate( tvm::Tensor Y; auto& A = inputs[0]; auto& B = inputs[1]; - const std::string& input_1_name = node.InputDefs()[1]->Name(); + // float MatMul, try use extern if (A->dtype == HalideIR::Float(32) && B->dtype == HalideIR::Float(32) && - MatMulF32ExternCpuEx(proto_type, *ctx_nuphar, A, B, Y, input_1_name)) { + MatMulF32ExternCPU(A, B, Y, node, ctx_codegen)) { outputs.push_back(Y); return Status::OK(); } + // if B is 2D initializer, use vertical stripe layout + const std::string& input_1_name = node.InputDefs()[1]->Name(); if (ShapeRank(node.InputDefs()[1]) == 2 && ctx_nuphar->IsInitializer(input_1_name)) { if (MatMul_weights2D(proto_type, A, B, input_1_name, *ctx_nuphar, Y)) { outputs.push_back(Y); diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/unary_ops.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/unary_ops.cc index ec9a22af84576..30fec1dc24c63 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/unary_ops.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/unary_ops.cc @@ -3,55 +3,13 @@ #include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" +#include 
"core/codegen/passes/op_ir_creator/math/unary_funcs.h" #include "core/framework/op_kernel_info.h" #include "core/providers/nuphar/mti_x86/math/unary_ops.h" namespace onnxruntime { namespace nuphar { -// helper class for unary_ops with alpha -class FuncWithAlpha { - public: - FuncWithAlpha(const Node& node) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); - } - - protected: - float alpha_; -}; - -// helper class for unary_ops with alpha and beta -class FuncWithAlphaBeta { - public: - FuncWithAlphaBeta(const Node& node) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); - ORT_ENFORCE(attrs.GetAttr("beta", &beta_).IsOK()); - } - - protected: - float alpha_; - float beta_; -}; - -// helper class for unary_ops with alpha and gamma -class FuncWithAlphaGamma { - public: - FuncWithAlphaGamma(const Node& node) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); - ORT_ENFORCE(attrs.GetAttr("gamma", &gamma_).IsOK()); - } - - protected: - float alpha_; - float gamma_; -}; - // helper macro declares unary_ops helper class without attribute #define FuncClass(name) \ class Func##name { \ @@ -64,7 +22,7 @@ class FuncWithAlphaGamma { // helper macro declares unary_ops helper class with alpha #define FuncClassAlpha(name) \ - class Func##name : public FuncWithAlpha { \ + class Func##name : public tvm_codegen::FuncWithAlpha { \ public: \ Func##name(const Node& node) : FuncWithAlpha(node) {} \ tvm::Tensor operator()(const tvm::Tensor& X) const { \ @@ -74,7 +32,7 @@ class FuncWithAlphaGamma { // helper macro declares unary_ops helper class with alpha and beta #define FuncClassAlphaBeta(name) \ - class Func##name : public FuncWithAlphaBeta { \ + class Func##name : public tvm_codegen::FuncWithAlphaBeta { \ public: \ Func##name(const Node& node) : FuncWithAlphaBeta(node) {} \ tvm::Tensor operator()(const tvm::Tensor& X) const { \ @@ -84,7 +42,7 @@ class FuncWithAlphaGamma { // helper macro declares unary_ops helper class with alpha and gamma #define FuncClassAlphaGamma(name) \ - class Func##name : public FuncWithAlphaGamma { \ + class Func##name : public tvm_codegen::FuncWithAlphaGamma { \ public: \ Func##name(const Node& node) : FuncWithAlphaGamma(node) {} \ tvm::Tensor operator()(const tvm::Tensor& X) const { \ diff --git a/onnxruntime/core/providers/nuphar/kernel.cc b/onnxruntime/core/providers/nuphar/kernel.cc index dbecb12d1a458..459894105e711 100644 --- a/onnxruntime/core/providers/nuphar/kernel.cc +++ b/onnxruntime/core/providers/nuphar/kernel.cc @@ -4,6 +4,7 @@ #include "core/providers/nuphar/kernel.h" #include "core/codegen/passes/utils/codegen_context.h" +#include "core/codegen/common/profile.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h" #include "core/providers/nuphar/compiler/initializer_info.h" @@ -117,6 +118,7 @@ Status NupharKernelState::Compute(OpKernelContext* op_kernel_context) const { compute_ctx->Bind(op_kernel_context); for (auto* call : exec_block_calls_) { + CODEGEN_PROFILER_EVENT(call->Name()); call->Run(compute_ctx); } diff --git a/onnxruntime/core/providers/nuphar/kernel.h b/onnxruntime/core/providers/nuphar/kernel.h index d308e9dae736e..c4f309ad77953 100644 --- a/onnxruntime/core/providers/nuphar/kernel.h +++ b/onnxruntime/core/providers/nuphar/kernel.h @@ -72,108 +72,109 @@ class 
NupharKernelState { #define DISABLE_MACRO(X) -#define LIST_NUPHAR_OPS() \ - NUPHAR_OP(Abs, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Add, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ArgMax, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ArgMax, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ArgMax, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ArgMin, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ArgMin, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(AveragePool, 7, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(AveragePool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(AveragePool, 11, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(Ceil, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Clip, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Concat, 4, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Concat, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - DISABLE_MACRO(NUPHAR_OP(Conv, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())) \ - NUPHAR_OP(Crop, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Div, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Dropout, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Elu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Equal, 7, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Equal, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Erf, 9, DataTypeImpl::GetTensorType()) \ - NUPHAR_OP(Exp, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Expand, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(Flatten, 1, 8, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Flatten, 9, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Flatten, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Floor, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Gemm, 7, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(Gemm, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(Greater, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(HardSigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Identity, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(LeakyRelu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Less, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Log, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(LogSoftmax, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(LogSoftmax, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - DISABLE_MACRO(NUPHAR_OP(LSTM, 7, DataTypeImpl::AllIEEEFloatTensorTypes())) \ - NUPHAR_VERSIONED_OP(MatMul, 1, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(MatMul, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(Max, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_VERSIONED_OP(MaxPool, 8, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(MaxPool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(MaxPool, 11, 
DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - NUPHAR_OP(Min, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Mul, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Neg, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Pad, 2, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(ParametricSoftplus, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(PRelu, 7, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Relu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Reciprocal, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceL1, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceL1, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceL2, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(ReduceL2, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceLogSum, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(ReduceLogSum, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceLogSumExp, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(ReduceLogSumExp, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceMax, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceMax, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceMean, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceMean, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceMin, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceMin, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceProd, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceProd, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceSum, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceSum, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(ReduceSumSquare, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ReduceSumSquare, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Reshape, 5, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ScaledTanh, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Selu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Shape, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Sigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Slice, 1, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Slice, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Slice, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(Softmax, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Softmax, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Softplus, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Softsign, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Split, 2, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Split, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_VERSIONED_OP(Squeeze, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Squeeze, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Sqrt, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ - NUPHAR_OP(Sub, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Sum, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Tanh, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(ThresholdedRelu, 1, 
DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Tile, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Transpose, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_VERSIONED_OP(Unsqueeze, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ - NUPHAR_OP(Unsqueeze, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ +#define LIST_NUPHAR_OPS() \ + NUPHAR_OP(Abs, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Add, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ArgMax, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ArgMax, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ArgMax, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ArgMin, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ArgMin, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(AveragePool, 7, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(AveragePool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(AveragePool, 11, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(Ceil, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Clip, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Concat, 4, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Concat, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + DISABLE_MACRO(NUPHAR_OP(Conv, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())) \ + NUPHAR_OP(Crop, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Div, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Dropout, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Elu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Equal, 7, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Equal, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Erf, 9, DataTypeImpl::GetTensorType()) \ + NUPHAR_OP(Exp, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Expand, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(Flatten, 1, 8, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Flatten, 9, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Flatten, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Floor, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Gemm, 7, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(Gemm, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(Greater, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(HardSigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Identity, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(LeakyRelu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Less, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Log, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(LogSoftmax, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(LogSoftmax, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + DISABLE_MACRO(NUPHAR_OP(LSTM, 7, DataTypeImpl::AllIEEEFloatTensorTypes())) \ + NUPHAR_VERSIONED_OP(MatMul, 1, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(MatMul, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(Max, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ + 
NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_VERSIONED_OP(MaxPool, 8, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(MaxPool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(MaxPool, 11, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(Min, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Mul, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Neg, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Pad, 2, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(ParametricSoftplus, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Pow, 7, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(PRelu, 7, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Relu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Reciprocal, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceL1, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceL1, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceL2, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(ReduceL2, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceLogSum, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(ReduceLogSum, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceLogSumExp, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(ReduceLogSumExp, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceMax, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceMax, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceMean, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceMean, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceMin, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceMin, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceProd, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceProd, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceSum, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceSum, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(ReduceSumSquare, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ReduceSumSquare, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Reshape, 5, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ScaledTanh, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Selu, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Shape, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Sigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Slice, 1, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Slice, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Slice, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(Softmax, 1, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Softmax, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Softplus, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Softsign, 1, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Split, 2, 10, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Split, 11, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_VERSIONED_OP(Squeeze, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + 
NUPHAR_OP(Squeeze, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Sqrt, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ + NUPHAR_OP(Sub, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Sum, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Tanh, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(ThresholdedRelu, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Tile, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Transpose, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_VERSIONED_OP(Unsqueeze, 1, 10, DataTypeImpl::AllFixedSizeTensorTypes()) \ + NUPHAR_OP(Unsqueeze, 11, DataTypeImpl::AllFixedSizeTensorTypes()) \ NUPHAR_OP(Where, 9, DataTypeImpl::AllFixedSizeTensorTypes()) } // namespace nuphar diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.cc index d18c6292495de..eb9a78ffea520 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.cc @@ -174,7 +174,7 @@ tvm::Expr raise_to_integer_power(const tvm::Expr& e, int64_t p) { * cast to Float(32). For Float(32), cleanly vectorizable, and * accurate up to the last few bits of the mantissa. Gets worse when * approaching overflow. Vectorizes cleanly. */ -inline tvm::Expr halideir_pow(tvm::Expr x, tvm::Expr y) { +tvm::Expr halideir_pow(tvm::Expr x, tvm::Expr y) { if (const int64_t* i = as_const_int(y)) { return raise_to_integer_power(x, *i); } diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.h b/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.h index e421bd5071715..80ed407175d4f 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.h +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/halide_ops.h @@ -43,6 +43,8 @@ tvm::Expr halideir_exp(const tvm::Expr& x_full); tvm::Expr halideir_log(const tvm::Expr& x_full); +tvm::Expr halideir_pow(tvm::Expr x, tvm::Expr y); + tvm::Expr fast_log(const tvm::Expr& x); } // namespace nuphar diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc index 1c30b29687843..141f70c572505 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc @@ -19,7 +19,7 @@ namespace nuphar { tvm::Tensor MatMul2D(const tvm::Tensor& A, const tvm::Tensor& B, bool trans_a, bool trans_b, const std::string& name) { tvm::Tensor Y; - if (MatMulExternCpu(A, B, Y, trans_a, trans_b)) + if (GemmExternCpu(A, B, Y, trans_a, trans_b)) return Y; return topi::matmul(A, B, trans_a, trans_b, name); @@ -110,31 +110,79 @@ TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.batched_matmul_cpu") DCHECK(tvm::runtime::TypeMatch(B->dtype, kDLFloat, 32)); DCHECK(tvm::runtime::TypeMatch(C->dtype, kDLFloat, 32)); - MatMulComputeHelper helper; - TensorShape A_shape(A->shape, A->ndim); - TensorShape B_shape(B->shape, B->ndim); - helper.Compute(A_shape, B_shape); - - size_t max_len = helper.OutputOffsets().size(); - for (size_t i = 0; i < max_len; i++) { - math::MatMul( - static_cast(helper.M()), - static_cast(helper.N()), - static_cast(helper.K()), - (float*)A->data + helper.LeftOffsets()[i], - (float*)B->data + helper.RightOffsets()[i], - (float*)C->data + helper.OutputOffsets()[i], - nullptr); // TODO: use thread pool from OpContext + if (args.num_args == 3) { + MatMulComputeHelper helper; + TensorShape A_shape(A->shape, A->ndim); + 
TensorShape B_shape(B->shape, B->ndim); + helper.Compute(A_shape, B_shape); + + size_t max_len = helper.OutputOffsets().size(); + for (size_t i = 0; i < max_len; i++) { + math::MatMul( + static_cast(helper.M()), + static_cast(helper.N()), + static_cast(helper.K()), + (float*)A->data + helper.LeftOffsets()[i], + (float*)B->data + helper.RightOffsets()[i], + (float*)C->data + helper.OutputOffsets()[i], + nullptr); // TODO: use thread pool from OpContext + } + } else { + // matmul fused with transpose, modify lda/ldb and step_a/step_b for the zero-cost transpose + DCHECK(A->ndim == B->ndim); + DCHECK(args.num_args - 3 == A->ndim + B->ndim); + std::vector permute_A(A->ndim); + std::vector stride_A(A->ndim); + std::vector permute_B(B->ndim); + std::vector stride_B(B->ndim); + int arg_idx = 3; + int num_matmuls = 1; + for (int i = 0; i < A->ndim; ++i) { + permute_A[i] = tvm::runtime::TVMArgValue(args.values[arg_idx + i], args.type_codes[arg_idx + i]); + if (i < A->ndim - 2) { + num_matmuls *= A->shape[permute_A[i]]; + } + stride_A[A->ndim - 1 - i] = (i == 0) ? 1 : stride_A[A->ndim - i] * A->shape[A->ndim - i]; + } + arg_idx += A->ndim; + for (int i = 0; i < B->ndim; ++i) { + permute_B[i] = tvm::runtime::TVMArgValue(args.values[arg_idx + i], args.type_codes[arg_idx + i]); + stride_B[B->ndim - 1 - i] = (i == 0) ? 1 : stride_B[B->ndim - i] * B->shape[B->ndim - i]; + } + + float alpha = 1.0f; + float beta = 0.0f; + int64_t M = A->shape[permute_A[A->ndim - 2]]; + int64_t K = A->shape[permute_A[A->ndim - 1]]; + int64_t N = B->shape[permute_B[B->ndim - 1]]; + bool trans_a = (permute_A[A->ndim - 2] == A->ndim - 1); + bool trans_b = (permute_B[B->ndim - 2] == B->ndim - 1); + int64_t step_a = stride_A[permute_A[A->ndim - 3]]; + int64_t lda = stride_A[permute_A[A->ndim - (trans_a ? 1 : 2)]]; + int64_t step_b = stride_B[permute_B[B->ndim - 3]]; + int64_t ldb = stride_B[permute_B[B->ndim - (trans_b ? 1 : 2)]]; + + for (int i = 0; i < num_matmuls; i++) { + math::GemmEx( + trans_a ? CblasTrans : CblasNoTrans, + trans_b ? 
CblasTrans : CblasNoTrans, + M, + N, + K, + alpha, + (float*)A->data + i * step_a, + lda, + (float*)B->data + i * step_b, + ldb, + beta, + (float*)C->data + i * M * N, + N, + nullptr); // TODO: use thread pool from OpContext + } } }); -bool MatMulExternCpu( - const tvm::Tensor& A, - const tvm::Tensor& B, - tvm::Tensor& Y, - bool trans_a, - bool trans_b, - const std::string& name) { +static bool ShouldUseMatMulExtern() { // Note: currently default behavior is always prefer extern const codegen::CodeGenSettings& settings = codegen::CodeGenSettings::Instance(); if (settings.HasOption(kNupharMatmulExec)) { @@ -144,6 +192,30 @@ bool MatMulExternCpu( if (!prefer_extern) return false; } + return true; +} + +bool CanPermuteBeFusedInMatMul(const std::vector& perm) { + auto rank = gsl::narrow(perm.size()); + if (rank < 2) return true; + + // only fusable if inner-most dim could be transposed + return (perm[rank - 1] == rank - 1) || + (perm[rank - 2] == rank - 1); +}; + +bool GemmExternCpu( + const tvm::Tensor& A, + const tvm::Tensor& B, + tvm::Tensor& Y, + bool trans_a, + bool trans_b, + const std::string& name) { + if (!ShouldUseMatMulExtern()) + return false; + + if (A->shape.size() == 1 && B->shape.size() == 1) + return false; // TVM extern cannot have output shape being empty // TODO: add support for mixed precisions if (A->dtype != B->dtype || @@ -151,69 +223,85 @@ bool MatMulExternCpu( A->dtype.bits() != 32) return false; - // inputs need to be at least 1D - auto rank_A = A->shape.size(); - auto rank_B = B->shape.size(); - if (rank_A < 1 || rank_B < 1) + tvm::Array out_shape = tvm_codegen::ComputeMatMulShape(A->shape, B->shape, trans_a, trans_b); + + Y = topi::detail::make_extern( + {out_shape}, {A->dtype}, {A, B}, + [&](tvm::Array ins, tvm::Array outs) { + return topi::detail::call_packed( + {tvm::Expr("tvm.contrib.onnxruntime.sgemm_cpu"), + topi::detail::pack_buffer(ins[0]), + topi::detail::pack_buffer(ins[1]), + topi::detail::pack_buffer(outs[0]), + trans_a, + trans_b}); + }, + name, "", {})[0]; + + return true; +} + +bool MatMulExternCpu( + const tvm::Tensor& A, + const tvm::Tensor& B, + tvm::Tensor& Y, + const std::vector* permute_A, + const std::vector* permute_B, + const std::string& name) { + if (permute_A != nullptr) { + ORT_ENFORCE(permute_B != nullptr); + ORT_ENFORCE(CanPermuteBeFusedInMatMul(*permute_A)); + ORT_ENFORCE(CanPermuteBeFusedInMatMul(*permute_B)); + ORT_ENFORCE(permute_A->size() == permute_B->size()); + ORT_ENFORCE(permute_A->size() == A->shape.size()); + ORT_ENFORCE(permute_B->size() == B->shape.size()); + } + + // TODO: add support for mixed precisions + if (A->dtype != B->dtype || + !A->dtype.is_float() || + A->dtype.bits() != 32) return false; - // only allow trans_a for 2D inputs - if (rank_A != 2 && trans_a) + // inputs need to be at least 1D + auto rank_A = gsl::narrow(A->shape.size()); + auto rank_B = gsl::narrow(B->shape.size()); + + if (rank_A < 1 || rank_B < 1) return false; // do not support 1-D x 1-D as tvm extern require buffer size > 0 if (rank_A == 1 && rank_B == 1) return false; - tvm::Array out_shape; - if (rank_A == 1) { - // 1-D x N-D - if (trans_b) { - ORT_ENFORCE(rank_B == 2); - out_shape.push_back(B->shape[0]); - } else { - for (size_t d = 0; d < rank_B - 2; ++d) - out_shape.push_back(B->shape[d]); - out_shape.push_back(B->shape[rank_B - 1]); - } - } else if (rank_B == 1) { - // N-D x 1-D - for (size_t d = 0; d < rank_A - 1; ++d) - out_shape.push_back(A->shape[d]); - } else { - // N-D x N-D - if (rank_B == 2) { - if (trans_a) { - // trans_a is only 
allowed for 2D - out_shape.push_back(A->shape[rank_A - 1]); - } else { - for (size_t d = 0; d < rank_A - 1; ++d) - out_shape.push_back(A->shape[d]); - } - out_shape.push_back(B->shape[trans_b ? rank_B - 2 : rank_B - 1]); - } else { - ORT_ENFORCE(!trans_a && !trans_b); - // batched matmul - out_shape = tvm_codegen::ComputeMatMulShape(A->shape, B->shape); - } + tvm::Array matmul_A_shape, matmul_B_shape; + for (int32_t d = 0; d < rank_A; ++d) { + matmul_A_shape.push_back(A->shape[permute_A != nullptr ? permute_A->at(d) : d]); + } + for (int32_t d = 0; d < rank_B; ++d) { + matmul_B_shape.push_back(B->shape[permute_B != nullptr ? permute_B->at(d) : d]); } + tvm::Array out_shape; + out_shape = tvm_codegen::ComputeMatMulShape(matmul_A_shape, matmul_B_shape); + Y = topi::detail::make_extern( {out_shape}, {A->dtype}, {A, B}, [&](tvm::Array ins, tvm::Array outs) { - if (rank_B <= 2) { - return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.sgemm_cpu"), - topi::detail::pack_buffer(ins[0]), - topi::detail::pack_buffer(ins[1]), - topi::detail::pack_buffer(outs[0]), - trans_a, - trans_b}); - } else { - return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.batched_matmul_cpu"), - topi::detail::pack_buffer(ins[0]), - topi::detail::pack_buffer(ins[1]), - topi::detail::pack_buffer(outs[0])}); + tvm::Array extern_args = { + tvm::Expr("tvm.contrib.onnxruntime.batched_matmul_cpu"), + topi::detail::pack_buffer(ins[0]), + topi::detail::pack_buffer(ins[1]), + topi::detail::pack_buffer(outs[0])}; + if (permute_A != nullptr && permute_B != nullptr) { + for (const auto& perm_A : *permute_A) { + extern_args.push_back(perm_A); + } + for (const auto& perm_B : *permute_B) { + extern_args.push_back(perm_B); + } } + return topi::detail::call_packed(extern_args); }, name, "", {})[0]; @@ -222,7 +310,7 @@ bool MatMulExternCpu( tvm::Tensor MatMul(const tvm::Tensor& A, const tvm::Tensor& B, const std::string& name) { tvm::Tensor Y; - if (MatMulExternCpu(A, B, Y)) + if (GemmExternCpu(A, B, Y)) return Y; // go through generic case otherwise return tvm_codegen::MatMul(A, B, name); diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.h b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.h index 53484dc7f5605..97c76cbb140fc 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.h +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.h @@ -10,7 +10,7 @@ namespace nuphar { tvm::Tensor MatMul2D(const tvm::Tensor& A, const tvm::Tensor& B, bool trans_a = false, bool trans_b = false, const std::string& name = "matmul2d"); -bool MatMulExternCpu( +bool GemmExternCpu( const tvm::Tensor& A, const tvm::Tensor& B, tvm::Tensor& Y, @@ -18,6 +18,16 @@ bool MatMulExternCpu( bool trans_b = false, const std::string& name = "matmul_extern_cpu"); +bool MatMulExternCpu( + const tvm::Tensor& A, + const tvm::Tensor& B, + tvm::Tensor& Y, + const std::vector* permute_A, + const std::vector* permute_B, + const std::string& name = "matmul_permute_extern_cpu"); + +bool CanPermuteBeFusedInMatMul(const std::vector& perm); + tvm::Tensor MatMul(const tvm::Tensor& A, const tvm::Tensor& B, const std::string& name); } // namespace nuphar diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/pow.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/pow.cc new file mode 100644 index 0000000000000..006d5f0a98035 --- /dev/null +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/pow.cc @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License.
+
+#include "core/providers/nuphar/mti_x86/math/halide_ops.h"
+#include "topi/broadcast.h"
+#include "tvm/ir.h"
+
+namespace onnxruntime {
+namespace nuphar {
+
+tvm::Tensor Pow(tvm::Tensor A, tvm::Tensor B, const std::string& name = "pow") {
+  return topi::power(A, B);
+}
+
+tvm::Tensor Pow(tvm::Tensor A, tvm::Expr B, const std::string& name = "pow") {
+  // special case for integer pow passed in
+  const tvm::ir::FloatImm* op = B.as<tvm::ir::FloatImm>();
+  if (op != nullptr) {
+    int64_t i = (int64_t)(op->value);
+    if ((double)i == op->value) {
+      B = tvm::make_const(HalideIR::Int(64), i);  // replace B with integer for halideir_pow
+    }
+  }
+  return tvm::compute(
+      A->shape,
+      [&](const tvm::Array<tvm::Var>& indices) {
+        return halideir_pow(A(indices), B);
+      },
+      name);
+}
+
+tvm::Tensor Pow(tvm::Expr A, tvm::Tensor B, const std::string& name = "pow") {
+  return tvm::compute(
+      B->shape,
+      [&](const tvm::Array<tvm::Var>& indices) {
+        return halideir_pow(A, B(indices));
+      },
+      name);
+}
+
+}  // namespace nuphar
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/pow.h b/onnxruntime/core/providers/nuphar/mti_x86/math/pow.h
new file mode 100644
index 0000000000000..339a75cad8464
--- /dev/null
+++ b/onnxruntime/core/providers/nuphar/mti_x86/math/pow.h
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include <string>
+#include <tvm/tvm.h>
+
+namespace onnxruntime {
+namespace nuphar {
+
+tvm::Tensor Pow(tvm::Tensor A, tvm::Tensor B, const std::string& name = "pow");
+tvm::Tensor Pow(tvm::Expr A, tvm::Tensor B, const std::string& name = "pow");
+tvm::Tensor Pow(tvm::Tensor A, tvm::Expr B, const std::string& name = "pow");
+
+}  // namespace nuphar
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc b/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc
index 2632434cfdeb6..7ffe9322bc2d5 100644
--- a/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc
+++ b/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc
@@ -189,7 +189,12 @@ Status SubgraphPartitioner::Partition(
       bool unused_initializer = false;
       if (t != nullptr) {
         // note for Reshape and Tile, shape/repeats as initializer is not used at runtime
-        unused_initializer = ((node.OpType() == "Reshape" || node.OpType() == "Tile") && i == 1);
+        // scalar initializers in binary ops are not used at runtime either
+        static const std::unordered_set<std::string> binary_ops =
+            {"Add", "Div", "Sub", "Mul", "Pow", "Equal", "Greater", "Less"};
+
+        unused_initializer = ((node.OpType() == "Reshape" || node.OpType() == "Tile") && i == 1) ||
+                             (binary_ops.count(node.OpType()) > 0 && t->Shape().Size() == 1);
 
         if (!unused_initializer) {
           subgraph.initializers.emplace(def.Name(), t);
diff --git a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py
index 0f900b595bec5..b59bc9a9411df 100644
--- a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py
+++ b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py
@@ -109,7 +109,7 @@ def __init__(self, int_max, auto_merge, verbose):
         self.verbose_ = verbose
         self.int_max_ = int_max
 
-    def _add_suggested_merge(self, symbols):
+    def _add_suggested_merge(self, symbols, apply=False):
         assert all([(type(s) == str and s in self.symbolic_dims_) or is_literal(s) for s in symbols])
         symbols = set(symbols)
         for k,v in self.suggested_merge_.items():
@@
-142,11 +142,13 @@ def _add_suggested_merge(self, symbols): for k,v in self.suggested_merge_.items(): if v == s: self.suggested_merge_[k] = map_to + if apply and self.auto_merge_: + self._apply_suggested_merge() - def _apply_suggested_merge_to_graph_input(self): + def _apply_suggested_merge(self, graph_input_only=False): if not self.suggested_merge_: return - for i in self.out_mp_.graph.input: + for i in list(self.out_mp_.graph.input) + ([] if graph_input_only else list(self.out_mp_.graph.value_info)): for d in i.type.tensor_type.shape.dim: if d.dim_param in self.suggested_merge_: v = self.suggested_merge_[d.dim_param] @@ -478,7 +480,7 @@ def _compute_matmul_shape(self, node, output_dtype=None): # record inconsistent reduce dim as suggested merge if lhs_shape[lhs_reduce_dim] != rhs_shape[rhs_reduce_dim]: merge_dims = [lhs_shape[lhs_reduce_dim], rhs_shape[rhs_reduce_dim]] - self._add_suggested_merge(merge_dims) + self._add_suggested_merge(merge_dims, apply=True) if output_dtype is None: # infer output_dtype from input type when not specified output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type @@ -954,7 +956,7 @@ def _infer_ZipMap(self, node): def _infer_impl(self, in_mp): self.sympy_data_ = {} self.out_mp_.graph.ClearField('value_info') - self._apply_suggested_merge_to_graph_input() + self._apply_suggested_merge(graph_input_only=True) input_symbols = set() for i in self.out_mp_.graph.input: input_symbols.update([d for d in get_shape_from_type_proto(i.type) if type(d) == str]) From 6b6f61eb1e81d8f7a6f9a47984788869a2d3736b Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Tue, 12 Nov 2019 12:01:28 -0800 Subject: [PATCH 02/11] Remove some unnecessary changes --- cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 4a8a4c3f7e189..52f0d60458ef9 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -188,7 +188,7 @@ if (MSVC) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4100") if (NOT onnxruntime_USE_CUDA) SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL") - #SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") + SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") endif() check_cxx_compiler_flag(-Qspectre HAS_QSPECTRE) if (HAS_QSPECTRE) From d699e29f062d722afe5e60b4f3dc1c22c861de70 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Tue, 12 Nov 2019 16:32:25 -0800 Subject: [PATCH 03/11] Address CR and update test --- .../x86/op_ir_creator/math/binary_ops.cc | 2 +- .../python/onnxruntime_test_python_nuphar.py | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc index 2e5ed9b2551de..9af9a37326baf 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc @@ -28,7 +28,7 @@ bool HandleConstantScalar(tvm::Expr& scalar, size_t i, const Node& node, CodeGen return false; // return if not constant or not scalar #define ASSIGN_TVM_SCALAR(tvm_type, tensor_type) \ - if (utils::IsPrimitiveDataType(tensor->DataType())) { \ + if (tensor->IsDataType()) { \ scalar = tvm::make_const(tvm_type, *tensor->Data()); \ } diff --git a/onnxruntime/test/python/onnxruntime_test_python_nuphar.py 
b/onnxruntime/test/python/onnxruntime_test_python_nuphar.py index 46415e5399962..a72050d70f80c 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_nuphar.py +++ b/onnxruntime/test/python/onnxruntime_test_python_nuphar.py @@ -90,6 +90,34 @@ def test_bidaf(self): sess.run([], feed) + def test_bert_squad(self): + # download BERT_squad model + cwd = os.getcwd() + bert_squad_url = 'https://onnxzoo.blob.core.windows.net/models/opset_10/bert_squad/download_sample_10.tar.gz' + cache_dir = os.path.join(os.path.expanduser("~"), '.cache','onnxruntime') + os.makedirs(cache_dir, exist_ok=True) + bert_squad_local = os.path.join(cache_dir, 'bert_squad.tar.gz') + if not os.path.exists(bert_squad_local): + urllib.request.urlretrieve(bert_squad_url, bert_squad_local) + with tarfile.open(bert_squad_local, 'r') as f: + f.extractall(cwd) + + # run symbolic shape inference on this model + # set int_max to 1,000,000 to simplify symbol computes for things like min(1000000, seq_len) -> seq_len + bert_squad_dir = os.path.join(cwd, 'download_sample_10') + bert_squad_model = os.path.join(bert_squad_dir, 'bertsquad10.onnx') + subprocess.run([sys.executable, '-m', 'onnxruntime.nuphar.symbolic_shape_infer', '--input', bert_squad_model, '--output', bert_squad_model, '--auto_merge', '--int_max=1000000'], check=True, cwd=cwd) + + # run onnx_test_runner to verify results + onnx_test_runner = os.path.join(cwd, 'onnx_test_runner') + subprocess.run([onnx_test_runner, '-e', 'nuphar', '-n', 'download_sample_10', cwd], check=True, cwd=cwd) + + # run onnxruntime_perf_test + onnx_test_runner = os.path.join(cwd, 'onnxruntime_perf_test') + subprocess.run([onnx_test_runner, '-e', 'nuphar', '-t', '20', bert_squad_model, '1.txt'], check=True, cwd=cwd) + subprocess.run([onnx_test_runner, '-e', 'cpu', '-o', '99', '-t', '20', bert_squad_model, '1.txt'], check=True, cwd=cwd) + + def test_rnn_benchmark(self): # make sure benchmarking scripts works # note: quantized model requires AVX2, otherwise it might be slow @@ -106,5 +134,6 @@ def test_rnn_benchmark(self): layers=3, seq_len=16, batch_size=2, min_duration_seconds=1) + if __name__ == '__main__': unittest.main() From e64e4b632deb154ae3376b82abcf6afd659e24f3 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Tue, 12 Nov 2019 23:26:09 -0800 Subject: [PATCH 04/11] Vectorize ReduceMean A better scalar initializer handling --- .../nuphar/common/nuphar_tvm_utils.cc | 33 ++++++ .../nuphar/common/nuphar_tvm_utils.h | 6 +- .../nuphar/compiler/codegen_manager.cc | 8 -- .../nuphar/compiler/nuphar_codegen_ctx.h | 11 ++ .../nuphar/compiler/nuphar_op_ir_builder.cc | 34 ++++++ .../compiler/nuphar_schedule_builder.cc | 2 + .../compiler/x86/op_ir_creator/all_ops.h | 19 +--- .../x86/op_ir_creator/math/binary_ops.cc | 105 ------------------ .../compiler/x86/op_ir_creator/math/matmul.cc | 2 +- .../compiler/x86/op_ir_creator/math/pow.cc | 46 ++++++++ .../nuphar/mti_x86/math/reduce_ops.cc | 17 +++ .../nuphar/mti_x86/math/reduce_ops.h | 7 ++ .../nuphar/partition/subgraph_partitioner.cc | 7 +- 13 files changed, 159 insertions(+), 138 deletions(-) delete mode 100644 onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc create mode 100644 onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/pow.cc diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc index 90d3c6018dca1..7ea0407c73d5c 100644 --- a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc +++ 
b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc @@ -170,5 +170,38 @@ std::string GetPackedFuncName(const nuphar::NupharSubgraphUnit& subgraph, const return NormalizeCppName("_" + subgraph.UniqueId() + " " + codegen_target.GetTargetName()); } +bool TryCreateConstantScalar( + tvm::Expr& scalar, + const Tensor* tensor) { + if (!tensor || tensor->Shape().Size() > 1) + return false; // return if not constant or not scalar + +#define ASSIGN_TVM_SCALAR(tvm_type, tensor_type) \ + if (tensor->IsDataType()) { \ + scalar = tvm::make_const(tvm_type, *tensor->Data()); \ + } + +#define ASSIGN_TVM_SCALAR_ELSE(tvm_type, tensor_type) \ + else ASSIGN_TVM_SCALAR(tvm_type, tensor_type) + + ASSIGN_TVM_SCALAR(HalideIR::Float(32), float) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Float(64), double) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(64), int64_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(32), int32_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(16), int16_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(8), int8_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(64), uint64_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(32), uint32_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(16), uint16_t) + ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(8), uint8_t) + else { + return false; + } + +#undef ASSIGN_TVM_SCALAR + + return true; +} + } // namespace nuphar } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.h b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.h index 3c26a0c6f61f9..614e1ac542553 100644 --- a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.h +++ b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.h @@ -8,7 +8,10 @@ #include "core/graph/graph.h" namespace onnxruntime { -class CodeGenTarget; //forward + +//forward +class CodeGenTarget; +class Tensor; namespace nuphar { @@ -22,5 +25,6 @@ void SaveTVMModuleToCache(const std::string& filename, tvm::runtime::Module& mod std::string GetPackedFuncName(const nuphar::NupharSubgraphUnit& subgraph, const CodeGenTarget& codegen_target); +bool TryCreateConstantScalar(tvm::Expr& scalar, const Tensor* tensor); } // namespace nuphar } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc index 3879bda9fe66e..582ada8f3b944 100644 --- a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc +++ b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc @@ -30,8 +30,6 @@ namespace nuphar { #define POOL_OP(OP) ADD_OP_ITEM(OP) #define REDUCE_V_OP(name) ADD_OP_ITEM(name) #define UNARY_OP(name) ADD_OP_ITEM(name) -#define BINARY_OP(name) ADD_OP_ITEM(name) -#define BINARY_CMP_OP(name) ADD_OP_ITEM(name) static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_registry) { LIST_ALL_X86_OPS() @@ -41,8 +39,6 @@ static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_re #undef POOL_OP #undef REDUCE_V_OP #undef UNARY_OP -#undef BINARY_OP -#undef BINARY_CMP_OP // END: NupharTVM X86 IR creator classes @@ -142,8 +138,6 @@ static void RegisterAllNupharWeightLayouts(tvm_codegen::WeightLayoutRegistry* la #define POOL_OP(OP) ADD_OP_ITEM(OP) #define REDUCE_V_OP(name) ADD_OP_ITEM(name) #define UNARY_OP(name) ADD_OP_ITEM(name) -#define BINARY_OP(name) ADD_OP_ITEM(name) -#define BINARY_CMP_OP(name) ADD_OP_ITEM(name) static void RegisterNupharX86Dispatcher(const std::shared_ptr& builder, const tvm_codegen::OpIRRegistry* registry) { @@ -156,8 +150,6 @@ static void 
RegisterNupharX86Dispatcher(const std::shared_ptr #include namespace onnxruntime { @@ -121,7 +122,17 @@ class NupharCodeGenCtx : public tvm_codegen::CodeGenContext { return tvm_tensor_ctx_; } + void InsertLiteral(const std::string& str) { + literalized_scalars_.insert(str); + } + + bool CheckLiteral(const std::string& str) { + return literalized_scalars_.count(str) > 0; + } + private: + std::set literalized_scalars_; + std::unique_ptr graph_stats_; const NupharCodeGenHandle* nuphar_handle_; diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc b/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc index 6c7567e1ddca7..e5932d32a0809 100644 --- a/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc +++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc @@ -8,6 +8,7 @@ #include "core/codegen/passes/op_ir_creator/tvm_ir_builder.h" #include "core/codegen/passes/utils/ort_tvm_utils.h" #include "core/common/common.h" +#include "core/providers/nuphar/common/nuphar_tvm_utils.h" #include "core/providers/nuphar/compiler/initializer_info.h" #include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" @@ -28,6 +29,10 @@ static const tvm::Tensor& GetOrCreateInitializer(const NodeArg* def, bool is_sliced, NupharCodeGenCtx& ctx_codegen); +static bool CreateScalarTensorFromInitializer(const Tensor* tensor, + const std::string& name, + NupharCodeGenCtx& ctx_codegen); + // CreateInputPlaceholder create tvm input placeholder (tvm::Tensor) // NOTE: here we assume axis 0 is sequence // TODO: add support for sequence not axis 0 @@ -51,6 +56,12 @@ static bool CreateInput( return false; ORT_ENFORCE(def->Shape()); + + if (nullptr != initialized_tensor && + CreateScalarTensorFromInitializer(initialized_tensor, def->Name(), ctx_codegen)) { + return false; // constant scalar tensor do not need to be in input + } + if (nullptr != initialized_tensor) { input = GetOrCreateInitializer(def, initialized_tensor, is_sliced, ctx_codegen); } else { @@ -68,6 +79,29 @@ static bool CreateInput( return true; } +bool CreateScalarTensorFromInitializer(const Tensor* tensor, + const std::string& name, + NupharCodeGenCtx& ctx_codegen) { + TVMTensorCtx& ctx_tensor = ctx_codegen.GetTVMTensorCtx(); + ORT_ENFORCE(tensor != nullptr); + + tvm::Expr constant_scalar; + if (!TryCreateConstantScalar(constant_scalar, tensor)) + return false; + + std::string normalized_name = NormalizeCppName(name); + auto tvm_tensor = tvm::compute( + tvm_codegen::ToTvmArray(tensor->Shape().GetDims()), + [&](const tvm::Array&) { + return constant_scalar; + }, + normalized_name); + + ctx_codegen.InsertLiteral(normalized_name); + ctx_tensor.inputs.emplace(name, std::move(tvm_tensor)); + return true; +} + // GetOrCreateInitializer create tvm::placeholder for a marshalled weight // with correpsonding data layout transfomration for a weight, // Note the weight is fed during build diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc index 2755f0c01aed1..57432e2b615e4 100644 --- a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc +++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc @@ -47,6 +47,8 @@ static void Traverse(const tvm::Tensor& tensor, if (t->op->InputTensors().size() > 0) { auto current_node = ctx_codegen.FindNode(t); Traverse(t, current_node, ctx_codegen, ctx_schedule); + } else if (ctx_codegen.CheckLiteral(t->op->name)) { + 
TryInlineSchedule(t, ctx_schedule); } } } diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h index 5a37ac21a9767..c06961633f9e5 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h @@ -38,18 +38,6 @@ namespace nuphar { UNARY_OP(Softplus) \ UNARY_OP(Tanh) -#define LIST_X86_BINARY_OPS() \ - BINARY_OP(Add) \ - BINARY_OP(Div) \ - BINARY_OP(Mul) \ - BINARY_OP(Pow) \ - BINARY_OP(Sub) - -#define LIST_X86_BINARY_CMP_OPS() \ - BINARY_CMP_OP(Equal) \ - BINARY_CMP_OP(Greater) \ - BINARY_CMP_OP(Less) - #define LIST_REDUCE_V_OPS() \ REDUCE_V_OP(ReduceMax) \ REDUCE_V_OP(ReduceMin) \ @@ -58,14 +46,13 @@ namespace nuphar { #define LIST_ALL_X86_OPS() \ LIST_REDUCE_V_OPS() \ LIST_X86_POOL_OPS() \ - LIST_X86_BINARY_OPS() \ - LIST_X86_BINARY_CMP_OPS() \ LIST_X86_UNARY_OPS() \ ADD_OP_ITEM(Gemm) \ ADD_OP_ITEM(LogSoftmax) \ ADD_OP_ITEM(MatMul) \ ADD_OP_ITEM(MatMulInteger) \ ADD_OP_ITEM(MatMulInteger16) \ + ADD_OP_ITEM(Pow) \ ADD_OP_ITEM(Scatter) \ ADD_OP_ITEM(ScatterElements) \ ADD_OP_ITEM(Slice) \ @@ -76,8 +63,6 @@ namespace nuphar { #define ADD_OP_ITEM(OP) DECLARE_NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(OP) #define POOL_OP(OP) ADD_OP_ITEM(OP) #define REDUCE_V_OP(OP) ADD_OP_ITEM(OP) -#define BINARY_OP(OP) ADD_OP_ITEM(OP) -#define BINARY_CMP_OP(OP) ADD_OP_ITEM(OP) #define UNARY_OP(OP) ADD_OP_ITEM(OP) LIST_ALL_X86_OPS() @@ -85,8 +70,6 @@ LIST_ALL_X86_OPS() #undef ADD_OP_ITEM #undef REDUCE_V_OP #undef POOL_OP -#undef BINARY_OP -#undef BINARY_CMP_OP #undef UNARY_OP } // namespace nuphar diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc deleted file mode 100644 index 9af9a37326baf..0000000000000 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/binary_ops.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" - -#include "core/codegen/common/op_macro.h" -#include "core/codegen/mti/math/binary_ops.h" -#include "core/codegen/mti/tensor/cast_ops.h" -#include "core/framework/op_kernel_info.h" -#include "core/providers/common.h" -#include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h" -#include "core/providers/nuphar/mti_x86/math/pow.h" - -namespace onnxruntime { -using namespace tvm_codegen; - -namespace nuphar { - -bool HandleConstantScalar(tvm::Expr& scalar, size_t i, const Node& node, CodeGenContext& ctx_codegen) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - NupharCodeGenCtx* ctx_nuphar = Promote(&ctx_codegen); - - ORT_ENFORCE(i < node.InputDefs().size()); - const auto* tensor = ctx_nuphar->GetOrtInitializerTensor(node.InputDefs()[i]->Name()); - - if (!tensor || tensor->Shape().Size() > 1) - return false; // return if not constant or not scalar - -#define ASSIGN_TVM_SCALAR(tvm_type, tensor_type) \ - if (tensor->IsDataType()) { \ - scalar = tvm::make_const(tvm_type, *tensor->Data()); \ - } - -#define ASSIGN_TVM_SCALAR_ELSE(tvm_type, tensor_type) \ - else ASSIGN_TVM_SCALAR(tvm_type, tensor_type) - - ASSIGN_TVM_SCALAR(HalideIR::Float(32), float) - ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(64), int64_t) - ASSIGN_TVM_SCALAR_ELSE(HalideIR::Int(32), int32_t) - ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(64), uint64_t) - ASSIGN_TVM_SCALAR_ELSE(HalideIR::UInt(32), uint32_t) - ASSIGN_TVM_SCALAR_ELSE(HalideIR::Float(64), double) - else { - return false; - } - -#undef ASSIGN_TVM_SCALAR - - return true; -} - -// helper local macro defines Evaluate of BINARY_OP OpIRCreators -#define BINARY_OP(name) \ - Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext& ctx_codegen, \ - tvm::Array& outputs) { \ - tvm::Expr scalar0, scalar1; \ - bool use_scalar0 = HandleConstantScalar(scalar0, 0, node, ctx_codegen); \ - bool use_scalar1 = HandleConstantScalar(scalar1, 1, node, ctx_codegen); \ - tvm::Tensor Y; \ - if (use_scalar0) \ - Y = name(scalar0, inputs[1], node.Name()); \ - else if (use_scalar1) \ - Y = name(inputs[0], scalar1, node.Name()); \ - else \ - Y = name(inputs[0], inputs[1], node.Name()); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -LIST_X86_BINARY_OPS() - -#undef BINARY_OP - -// helper local macro defines Evaluate of BINARY_CMP_OP OpIRCreators -#define BINARY_CMP_OP(name) \ - Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext& ctx_codegen, \ - tvm::Array& outputs) { \ - tvm::Expr scalar0, scalar1; \ - bool use_scalar0 = HandleConstantScalar(scalar0, 0, node, ctx_codegen); \ - bool use_scalar1 = HandleConstantScalar(scalar1, 1, node, ctx_codegen); \ - tvm::Tensor Y; \ - if (use_scalar0) \ - Y = name(scalar0, inputs[1], node.Name()); \ - else if (use_scalar1) \ - Y = name(inputs[0], scalar1, node.Name()); \ - else \ - Y = name(inputs[0], inputs[1], node.Name()); \ - Y = Cast(Y, HalideIR::UInt(8), "cast_bool_" #name); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -LIST_X86_BINARY_CMP_OPS() - -#undef BINARY_CMP_OP - -} // namespace nuphar -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc index f766fe98ad7c1..9351aa996aab4 100644 --- 
a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/matmul.cc @@ -134,7 +134,7 @@ static bool MatMulF32ExternCPU( const std::vector* p_permute_B = nullptr; tvm::Tensor root_A = find_transposed_input(A, permute_A); tvm::Tensor root_B = find_transposed_input(B, permute_B); - if (A->shape.size() == B->shape.size() && A->shape.size() > 2) { + if (A->shape.size() == B->shape.size() && A->shape.size() >= 2) { // currently only fuse Transpose into MatMul when rank(A) == rank(B) // make sure no broadcasting in MatMul bool no_broadcast = true; diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/pow.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/pow.cc new file mode 100644 index 0000000000000..7b1d04d1a944c --- /dev/null +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/pow.cc @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/framework/op_kernel_info.h" +#include "core/providers/nuphar/common/nuphar_tvm_utils.h" +#include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h" +#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h" +#include "core/providers/nuphar/mti_x86/math/pow.h" + +namespace onnxruntime { +namespace nuphar { + +Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(Pow)::Evaluate( + const tvm::Array& inputs, + const Node& node, + tvm_codegen::CodeGenContext& ctx_codegen, + tvm::Array& outputs) { + ORT_ENFORCE(inputs.size() == 2); + + struct { + tvm::Expr expr; + bool is_scalar; + } constant_scalars[2]; + + for (size_t i = 0; i < 2; ++i) { + ProtoHelperNodeContext ctx(node); + OpNodeProtoHelper info(&ctx); + NupharCodeGenCtx* ctx_nuphar = Promote(&ctx_codegen); + + ORT_ENFORCE(i < node.InputDefs().size()); + const auto* tensor = ctx_nuphar->GetOrtInitializerTensor(node.InputDefs()[i]->Name()); + constant_scalars[i].is_scalar = TryCreateConstantScalar(constant_scalars[i].expr, tensor); + } + tvm::Tensor Y; + if (constant_scalars[0].is_scalar) + Y = Pow(constant_scalars[0].expr, inputs[1], node.Name()); + else if (constant_scalars[1].is_scalar) + Y = Pow(inputs[0], constant_scalars[1].expr, node.Name()); + else + Y = Pow(inputs[0], inputs[1], node.Name()); + outputs.push_back(Y); + return Status::OK(); +} + +} // namespace nuphar +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc index d78d2473d81e2..3e193517a71cf 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc @@ -305,6 +305,23 @@ tvm::Tensor ReduceMin(const tvm::Tensor& X, X->dtype.max(), vector_size, last_dim_aligned, fuse_dim, name); } +tvm::Tensor ReduceMean(const tvm::Tensor& X, + const std::vector& axes, bool keep_dims, + const int32_t vector_size, + bool last_dim_aligned, + int32_t fuse_dim, + const std::string& name) { + tvm::Tensor sum = ReduceValue(X, tvm::sum, axes, keep_dims, + X->dtype.max(), vector_size, last_dim_aligned, fuse_dim, name); + + tvm::Expr count = tvm::make_const(HalideIR::Int(32), 1); + for (auto ax : axes) { + ax = tvm_codegen::HandleNegativeAxis(ax, X->shape.size()); + count = count * X->shape[ax]; + } + return topi::divide(sum, tvm::cast(X->dtype, count)); +} + // [WIP] a special vectorization friendly value reduction // Keep_dim always true 
tvm::Tensor ReduceValueLowest_noPad(const tvm::Tensor& X, diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.h b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.h index 4f15255df1ab5..eca60c2eb1cbe 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.h +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.h @@ -36,5 +36,12 @@ tvm::Tensor ReduceMin(const tvm::Tensor& X, int32_t fuse_dim = 0, const std::string& name = "reduce_min_v"); +tvm::Tensor ReduceMean(const tvm::Tensor& X, + const std::vector& axes, bool keep_dims, + const int32_t vector_size, + bool last_dim_aligned = false, + int32_t fuse_dim = 0, + const std::string& name = "reduce_mean_v"); + } // namespace nuphar } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc b/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc index 7ffe9322bc2d5..29c5339f6ebef 100644 --- a/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc +++ b/onnxruntime/core/providers/nuphar/partition/subgraph_partitioner.cc @@ -189,12 +189,9 @@ Status SubgraphPartitioner::Partition( bool unused_initializer = false; if (t != nullptr) { // note for Reshape and Tile, shape/repeats as initializer is not used at runtime - // scalar initializers in binary ops are not used at runtime either - static const std::unordered_set binary_ops = - {"Add", "Div", "Sub", "Mul", "Pow", "Equal", "Greater", "Less"}; - + // neither for any scalar unused_initializer = ((node.OpType() == "Reshape" || node.OpType() == "Tile") && i == 1) || - (binary_ops.count(node.OpType()) > 0 && t->Shape().Size() == 1); + t->Shape().Size() == 1; if (!unused_initializer) { subgraph.initializers.emplace(def.Name(), t); From a4d8a394c8305e5705381e7c8fc5ed4b73d96329 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 00:01:41 -0800 Subject: [PATCH 05/11] Missing reduce mean --- cmake/CMakeLists.txt | 2 +- .../core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 52f0d60458ef9..4a8a4c3f7e189 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -188,7 +188,7 @@ if (MSVC) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4100") if (NOT onnxruntime_USE_CUDA) SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL") - SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") + #SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") endif() check_cxx_compiler_flag(-Qspectre HAS_QSPECTRE) if (HAS_QSPECTRE) diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h index c06961633f9e5..84868996cf8d9 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h @@ -41,7 +41,8 @@ namespace nuphar { #define LIST_REDUCE_V_OPS() \ REDUCE_V_OP(ReduceMax) \ REDUCE_V_OP(ReduceMin) \ - REDUCE_V_OP(ReduceSum) + REDUCE_V_OP(ReduceSum) \ + REDUCE_V_OP(ReduceMean) #define LIST_ALL_X86_OPS() \ LIST_REDUCE_V_OPS() \ From 2f3f06cae57890c654d6ce7dfb77092aaa719989 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 00:11:28 -0800 Subject: [PATCH 06/11] undo unintended change --- cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt 
b/cmake/CMakeLists.txt index 4a8a4c3f7e189..52f0d60458ef9 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -188,7 +188,7 @@ if (MSVC) SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4100") if (NOT onnxruntime_USE_CUDA) SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Gw /GL") - #SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") + SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Gw /GL") endif() check_cxx_compiler_flag(-Qspectre HAS_QSPECTRE) if (HAS_QSPECTRE) From 8f9fad5289cabd2b48d0f26acf71ecea44816f8c Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 15:23:36 -0800 Subject: [PATCH 07/11] Treat constant initializers that has same value for all elements as scalar --- .../nuphar/common/nuphar_tvm_utils.cc | 37 ++++++++++++++++++- .../nuphar/mti_x86/math/matmul_ops.cc | 4 +- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc index 7ea0407c73d5c..7f3e2dda0d0e4 100644 --- a/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc +++ b/onnxruntime/core/providers/nuphar/common/nuphar_tvm_utils.cc @@ -173,8 +173,41 @@ std::string GetPackedFuncName(const nuphar::NupharSubgraphUnit& subgraph, const bool TryCreateConstantScalar( tvm::Expr& scalar, const Tensor* tensor) { - if (!tensor || tensor->Shape().Size() > 1) - return false; // return if not constant or not scalar + if (!tensor) + return false; + + auto num_elements = tensor->Shape().Size(); + if (num_elements > 1) { + // for non-scalar, only fold to constant scalar when all values are identical + const auto& dtype = tensor->DataType(); + auto elem_size = dtype->Size(); + const void* data = tensor->DataRaw(); + +#define CHECK_ALL_TENSOR_SAME(T) \ + for (int64_t i = 1; i < num_elements; ++i) { \ + if (reinterpret_cast(data)[i] != reinterpret_cast(data)[0]) \ + return false; \ + } + + switch (elem_size) { + case 1: + CHECK_ALL_TENSOR_SAME(int8_t); + break; + case 2: + CHECK_ALL_TENSOR_SAME(int16_t); + break; + case 4: + CHECK_ALL_TENSOR_SAME(int32_t); + break; + case 8: + CHECK_ALL_TENSOR_SAME(int64_t); + break; + default: + return false; + } + +#undef CHECK_ALL_TENSOR_SAME + } #define ASSIGN_TVM_SCALAR(tvm_type, tensor_type) \ if (tensor->IsDataType()) { \ diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc index 141f70c572505..8fbbc60d72564 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/matmul_ops.cc @@ -157,9 +157,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.batched_matmul_cpu") int64_t N = B->shape[permute_B[B->ndim - 1]]; bool trans_a = (permute_A[A->ndim - 2] == A->ndim - 1); bool trans_b = (permute_B[B->ndim - 2] == B->ndim - 1); - int64_t step_a = stride_A[permute_A[A->ndim - 3]]; + int64_t step_a = num_matmuls > 1 ? stride_A[permute_A[A->ndim - 3]] : 0; int64_t lda = stride_A[permute_A[A->ndim - (trans_a ? 1 : 2)]]; - int64_t step_b = stride_B[permute_B[B->ndim - 3]]; + int64_t step_b = num_matmuls > 1 ? stride_B[permute_B[B->ndim - 3]] : 0; int64_t ldb = stride_B[permute_B[B->ndim - (trans_b ? 
1 : 2)]]; for (int i = 0; i < num_matmuls; i++) { From f4345bf83b996dc67b35d417221acfcf58ca896b Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 16:24:34 -0800 Subject: [PATCH 08/11] Fix reduce test --- .../providers/nuphar/mti_x86/math/reduce_ops.cc | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc index 3e193517a71cf..df35a42d318f0 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc @@ -314,10 +314,16 @@ tvm::Tensor ReduceMean(const tvm::Tensor& X, tvm::Tensor sum = ReduceValue(X, tvm::sum, axes, keep_dims, X->dtype.max(), vector_size, last_dim_aligned, fuse_dim, name); - tvm::Expr count = tvm::make_const(HalideIR::Int(32), 1); - for (auto ax : axes) { - ax = tvm_codegen::HandleNegativeAxis(ax, X->shape.size()); - count = count * X->shape[ax]; + tvm::Expr count; + if (axes.size() > 0) { + count = tvm::make_const(HalideIR::Int(32), 1); + for (auto ax : axes) { + ax = tvm_codegen::HandleNegativeAxis(ax, X->shape.size()); + count = count * X->shape[ax]; + } + } else { + // by default, reduce over all axes + count = tvm_codegen::SizeFromDimension(X->shape, 0); } return topi::divide(sum, tvm::cast(X->dtype, count)); } From 3836a2a8b9686a5358e9a5b729e4450eb0cfd74c Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 17:03:56 -0800 Subject: [PATCH 09/11] Fix reduce mean tests --- .../core/providers/nuphar/mti_x86/math/reduce_ops.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc index df35a42d318f0..e816c77ed90ab 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc @@ -311,9 +311,7 @@ tvm::Tensor ReduceMean(const tvm::Tensor& X, bool last_dim_aligned, int32_t fuse_dim, const std::string& name) { - tvm::Tensor sum = ReduceValue(X, tvm::sum, axes, keep_dims, - X->dtype.max(), vector_size, last_dim_aligned, fuse_dim, name); - + tvm::Tensor sum = ReduceSum(X, axes, keep_dims, vector_size, last_dim_aligned, fuse_dim, name + "_sum"); tvm::Expr count; if (axes.size() > 0) { count = tvm::make_const(HalideIR::Int(32), 1); @@ -325,7 +323,7 @@ tvm::Tensor ReduceMean(const tvm::Tensor& X, // by default, reduce over all axes count = tvm_codegen::SizeFromDimension(X->shape, 0); } - return topi::divide(sum, tvm::cast(X->dtype, count)); + return topi::divide(sum, tvm::cast(X->dtype, count), name + "_div"); } // [WIP] a special vectorization friendly value reduction From 5d8730f54b1388828f87ff771d203e306a9080d2 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Wed, 13 Nov 2019 22:42:15 -0800 Subject: [PATCH 10/11] Improve symbolic shape inference --- .../nuphar/scripts/symbolic_shape_infer.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py index b59bc9a9411df..48becfede31c8 100644 --- a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py +++ b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py @@ -48,6 +48,14 @@ def as_scalar(x): else: return x +def as_list(x): + if type(x) == list: + return x + elif type(x) == np.ndarray: + 
return list(x) + else: + return [x] + def sympy_reduce_product(x): if type(x) == list: value = sympy.Integer(1) @@ -809,14 +817,16 @@ def _infer_Slice(self, node): ends = get_attribute(node, 'ends') steps = [1]*len(axes) else: - starts = self._try_get_value(node, 1) - ends = self._try_get_value(node, 2) + starts = as_list(self._try_get_value(node, 1)) + ends = as_list(self._try_get_value(node, 2)) axes = self._try_get_value(node, 3) steps = self._try_get_value(node, 4) if axes is None and not (starts is None and ends is None): axes = list(range(0, len(starts if starts is not None else ends))) if steps is None and not (starts is None and ends is None): steps = [1]*len(starts if starts is not None else ends) + axes = as_list(axes) + steps = as_list(steps) new_sympy_shape = self._get_sympy_shape(node, 0) if starts is None or ends is None: From 3b1e5407e64494e6d4df5ce93a1072de75915943 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Thu, 14 Nov 2019 00:34:58 -0800 Subject: [PATCH 11/11] Minor updates for better debugging --- onnxruntime/core/providers/nuphar/common/nuphar_subgraph.h | 4 ---- onnxruntime/core/providers/nuphar/runtime/exec_block.cc | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_subgraph.h b/onnxruntime/core/providers/nuphar/common/nuphar_subgraph.h index 06e105150ad54..52f950a648461 100644 --- a/onnxruntime/core/providers/nuphar/common/nuphar_subgraph.h +++ b/onnxruntime/core/providers/nuphar/common/nuphar_subgraph.h @@ -83,10 +83,6 @@ struct NupharSubgraphUnit { return nodes.size() == 1; } - const std::string& Name() const { - return nodes.front()->Name(); - } - std::string UniqueId() const { return std::to_string(id_); } diff --git a/onnxruntime/core/providers/nuphar/runtime/exec_block.cc b/onnxruntime/core/providers/nuphar/runtime/exec_block.cc index 26feec129758e..6aed253ab7d43 100644 --- a/onnxruntime/core/providers/nuphar/runtime/exec_block.cc +++ b/onnxruntime/core/providers/nuphar/runtime/exec_block.cc @@ -18,10 +18,10 @@ void CreateExecBlock(std::vector>& exec_blocks, bool /*enable_tiling*/) { if (subgraph.IsSingleNode() && subgraph.nodes.front()->OpType() == "Scan") { exec_blocks.push_back( - std::move(onnxruntime::make_unique(func_info, "nuphar_exec_" + subgraph.Name()))); + std::move(onnxruntime::make_unique(func_info, "nuphar_exec_" + subgraph.UniqueId()))); } else { exec_blocks.push_back( - std::move(onnxruntime::make_unique(func_info, "nuphar_exec_" + subgraph.Name()))); + std::move(onnxruntime::make_unique(func_info, "nuphar_exec_" + subgraph.UniqueId()))); } }
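
Editor's note — illustrative sketch only, not part of the patch series. Patch 07 ("Treat constant initializers that has same value for all elements as scalar") extends TryCreateConstantScalar in nuphar_tvm_utils.cc so that a constant initializer whose elements are all bit-identical is folded to a single scalar expression. The standalone C++ sketch below reproduces just that "all elements identical" check outside of onnxruntime; the helper name AllElementsIdentical, the use of std::memcmp, and the main() driver are assumptions introduced for this example and do not appear in the patch.

// Minimal sketch of the identical-element check behind constant-scalar folding.
// Assumes a raw byte buffer plus element count/size instead of onnxruntime::Tensor.
#include <cstdint>
#include <cstring>
#include <cassert>

// Returns true when every element in the buffer is byte-for-byte equal to the
// first one, comparing elements by their width (1/2/4/8 bytes), which mirrors
// the patch's switch on elem_size with reinterpret_cast comparisons.
static bool AllElementsIdentical(const void* data, int64_t num_elements, size_t elem_size) {
  const uint8_t* bytes = static_cast<const uint8_t*>(data);
  for (int64_t i = 1; i < num_elements; ++i) {
    if (std::memcmp(bytes + i * elem_size, bytes, elem_size) != 0)
      return false;  // at least one element differs: keep the tensor as-is
  }
  return true;  // all elements identical: safe to fold to a scalar constant
}

int main() {
  const float same[4]  = {2.5f, 2.5f, 2.5f, 2.5f};
  const float mixed[4] = {2.5f, 2.5f, 3.0f, 2.5f};
  assert(AllElementsIdentical(same, 4, sizeof(float)));    // foldable to a scalar
  assert(!AllElementsIdentical(mixed, 4, sizeof(float)));  // must stay a full initializer
  return 0;
}

Because the comparison is bitwise (per element width) rather than typed floating-point equality, it matches the behavior of the patch's int8_t/int16_t/int32_t/int64_t reinterpretation; only after this check passes does the patched code go on to build the tvm::make_const scalar for the tensor's data type.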