mv fused_bias_dropout_residual_ln to fluid manual dir #48824

Merged
paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h (13 additions, 0 deletions)
@@ -107,3 +107,16 @@ paddle::experimental::Tensor fused_gemm_epilogue_dygraph_function(
const paddle::experimental::Tensor& Y,
const paddle::experimental::Tensor& Bias,
const paddle::framework::AttributeMap& attr_map);

std::tuple<paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor>
fused_bias_dropout_residual_layer_norm_dygraph_function(
const paddle::experimental::Tensor& X,
const paddle::experimental::Tensor& Residual,
const paddle::experimental::Tensor& Bias,
const paddle::experimental::Tensor& LnScale,
const paddle::experimental::Tensor& LnBias,
const paddle::framework::AttributeMap& attr_map);
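
For orientation, here is a minimal, non-authoritative sketch of how this new entry point could be called from C++ eager-mode code. It is not part of the PR; the caller name, the way the input tensors are obtained, and the attribute names ("dropout_rate", "ln_epsilon") are assumptions for illustration only.

#include <tuple>

#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h"

// Hypothetical caller: x, residual, bias, ln_scale, ln_bias are assumed to be
// already-initialized paddle::experimental::Tensor inputs.
void fused_bias_dropout_residual_ln_sketch(
    const paddle::experimental::Tensor& x,
    const paddle::experimental::Tensor& residual,
    const paddle::experimental::Tensor& bias,
    const paddle::experimental::Tensor& ln_scale,
    const paddle::experimental::Tensor& ln_bias) {
  paddle::framework::AttributeMap attrs;
  attrs["dropout_rate"] = 0.1f;  // assumed attribute name
  attrs["ln_epsilon"] = 1e-5f;   // assumed attribute name

  // The function returns {BiasDropoutResidualOut, DropoutMaskOut, LnMean,
  // LnVariance, Y}; Y is the final layer-norm output.
  auto outs = fused_bias_dropout_residual_layer_norm_dygraph_function(
      x, residual, bias, ln_scale, ln_bias, attrs);
  const paddle::experimental::Tensor& y = std::get<4>(outs);
  (void)y;
}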
@@ -3,4 +3,5 @@ set(fluid_manual_functions
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc
PARENT_SCOPE)
paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc (new file)
@@ -0,0 +1,230 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/amp_auto_cast.h"
#include "paddle/fluid/eager/amp_utils.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"

std::tuple<paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor>
fused_bias_dropout_residual_layer_norm_dygraph_function(
const paddle::experimental::Tensor& X,
const paddle::experimental::Tensor& Residual,
const paddle::experimental::Tensor& Bias,
const paddle::experimental::Tensor& LnScale,
const paddle::experimental::Tensor& LnBias,
const paddle::framework::AttributeMap& attr_map) {
paddle::platform::RecordEvent dygraph_entrance_record_event(
"fused_bias_dropout_residual_layer_norm dygraph",
paddle::platform::TracerEventType::Operator,
1);
VLOG(3) << "Running Eager Forward Op: fused_bias_dropout_residual_layer_norm";
// Dygraph Forward Pass

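// If AMP (auto mixed precision) is enabled, cast the inputs to the AMP
// destination dtype and re-enter this function with AMP temporarily switched
// off (AutoCastGuard at O0) so the casts are not applied a second time.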
if (egr::Controller::Instance().GetAMPLevel() !=
paddle::imperative::AmpLevel::O0) {
VLOG(5) << "Check and Prepare For AMP";

paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
amp_tensors_vector = {{X}, {Residual}};
if (Bias.initialized()) amp_tensors_vector.push_back({Bias});
if (LnScale.initialized()) amp_tensors_vector.push_back({LnScale});
if (LnBias.initialized()) amp_tensors_vector.push_back({LnBias});

auto amp_dst_dtype = egr::GetAmpDestDtype(
"fused_bias_dropout_residual_layer_norm", amp_tensors_vector);

auto NEW_X = egr::AmpAutoCast(
"X", X, amp_dst_dtype, "fused_bias_dropout_residual_layer_norm");
auto NEW_Residual =
egr::AmpAutoCast("Residual",
Residual,
amp_dst_dtype,
"fused_bias_dropout_residual_layer_norm");
auto NEW_Bias =
((Bias.initialized())
? egr::AmpAutoCast("Bias",
Bias,
amp_dst_dtype,
"fused_bias_dropout_residual_layer_norm")
: Bias);
auto NEW_LnScale =
((LnScale.initialized())
? egr::AmpAutoCast("LnScale",
LnScale,
amp_dst_dtype,
"fused_bias_dropout_residual_layer_norm")
: LnScale);
auto NEW_LnBias =
((LnBias.initialized())
? egr::AmpAutoCast("LnBias",
LnBias,
amp_dst_dtype,
"fused_bias_dropout_residual_layer_norm")
: LnBias);

{
paddle::imperative::AutoCastGuard guard(
egr::Controller::Instance().GetCurrentTracer(),
paddle::imperative::AmpLevel::O0);
return fused_bias_dropout_residual_layer_norm_dygraph_function(
NEW_X, NEW_Residual, NEW_Bias, NEW_LnScale, NEW_LnBias, attr_map);
}
}

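// Collect the op inputs; the optional inputs (Bias, LnScale, LnBias) are only
// passed through when they hold an initialized tensor.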
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> ins =
{{"X", egr::EagerUtils::TrySyncToVars(X)},
{"Residual", egr::EagerUtils::TrySyncToVars(Residual)}};
if (Bias.initialized()) ins["Bias"] = egr::EagerUtils::TrySyncToVars(Bias);
if (LnScale.initialized())
ins["LnScale"] = egr::EagerUtils::TrySyncToVars(LnScale);
if (LnBias.initialized())
ins["LnBias"] = egr::EagerUtils::TrySyncToVars(LnBias);

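// Allocate a fresh EagerVariable for each of the five forward outputs.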
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> outs =
{{"BiasDropoutResidualOut",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"DropoutMaskOut",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"LnMean",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"LnVariance",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Y",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}}};

// Prepare Autograd Meta
egr::AutogradMeta* p_autograd_X = egr::EagerUtils::nullable_autograd_meta(X);
egr::AutogradMeta* p_autograd_Residual =
egr::EagerUtils::nullable_autograd_meta(Residual);
egr::AutogradMeta* p_autograd_Bias =
egr::EagerUtils::nullable_autograd_meta(Bias);
egr::AutogradMeta* p_autograd_LnScale =
egr::EagerUtils::nullable_autograd_meta(LnScale);
egr::AutogradMeta* p_autograd_LnBias =
egr::EagerUtils::nullable_autograd_meta(LnBias);

bool trace_backward = egr::Controller::Instance().HasGrad();

bool require_any_grad =
egr::EagerUtils::ComputeRequireGrad(trace_backward,
p_autograd_X,
p_autograd_Residual,
p_autograd_Bias,
p_autograd_LnScale,
p_autograd_LnBias);

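// Copy the attribute map and trace the fluid op: TraceOp runs
// fused_bias_dropout_residual_layer_norm on the expected place and writes the
// results into the output variables prepared above.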
paddle::framework::AttributeMap attrs = attr_map;
paddle::framework::AttributeMap default_attrs;
egr::Controller::Instance().GetCurrentTracer()->TraceOp(
"fused_bias_dropout_residual_layer_norm",
ins,
outs,
attrs,
egr::Controller::Instance().GetExpectedPlace(),
&default_attrs,
true,
{});

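// Unpack the traced op results back into paddle::experimental::Tensor objects.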
paddle::experimental::Tensor BiasDropoutResidualOut;
egr::EagerUtils::GetOutput(outs["BiasDropoutResidualOut"][0],
&BiasDropoutResidualOut);
paddle::experimental::Tensor DropoutMaskOut;
egr::EagerUtils::GetOutput(outs["DropoutMaskOut"][0], &DropoutMaskOut);
paddle::experimental::Tensor LnMean;
egr::EagerUtils::GetOutput(outs["LnMean"][0], &LnMean);
paddle::experimental::Tensor LnVariance;
egr::EagerUtils::GetOutput(outs["LnVariance"][0], &LnVariance);
paddle::experimental::Tensor Y;
egr::EagerUtils::GetOutput(outs["Y"][0], &Y);

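// Node creation: when any input requires grad, build the compat grad node,
// save the tensors needed by the backward kernel as tensor wrappers, and wire
// up the grad in/out meta so autograd knows the slot layout (5 in, 5 out).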
{
paddle::platform::RecordEvent node_creation_record_event(
"fused_bias_dropout_residual_layer_norm node_creation",
paddle::platform::TracerEventType::OperatorInner,
1);
egr::AutogradMeta* p_autograd_BiasDropoutResidualOut =
egr::EagerUtils::autograd_meta(&BiasDropoutResidualOut);
egr::AutogradMeta* p_autograd_DropoutMaskOut =
egr::EagerUtils::autograd_meta(&DropoutMaskOut);
egr::AutogradMeta* p_autograd_LnMean =
egr::EagerUtils::autograd_meta(&LnMean);
egr::AutogradMeta* p_autograd_LnVariance =
egr::EagerUtils::autograd_meta(&LnVariance);
egr::AutogradMeta* p_autograd_Y = egr::EagerUtils::autograd_meta(&Y);
if (require_any_grad) {
VLOG(6) << " Construct Grad for fused_bias_dropout_residual_layer_norm ";
egr::EagerUtils::PassStopGradient(false,
p_autograd_BiasDropoutResidualOut,
p_autograd_DropoutMaskOut,
p_autograd_LnMean,
p_autograd_LnVariance,
p_autograd_Y);
// Create GradOpNode
auto grad_node =
std::shared_ptr<fused_bias_dropout_residual_layer_normGradNodeCompat>(
new fused_bias_dropout_residual_layer_normGradNodeCompat(5, 5));

// Set Attributes
grad_node->SetAttrMap(std::move(attrs));
grad_node->SetDefaultAttrMap(std::move(default_attrs));

// Set Tensor Wrappers
grad_node->SetTensorWrapperBias(Bias);
grad_node->SetTensorWrapperBiasDropoutResidualOut(BiasDropoutResidualOut);
grad_node->SetTensorWrapperDropoutMaskOut(DropoutMaskOut);
grad_node->SetTensorWrapperLnBias(LnBias);
grad_node->SetTensorWrapperLnMean(LnMean);
grad_node->SetTensorWrapperLnScale(LnScale);
grad_node->SetTensorWrapperLnVariance(LnVariance);
grad_node->SetTensorWrapperResidual(Residual);
grad_node->SetTensorWrapperX(X);

grad_node->SetGradOutMeta(X, 0);
grad_node->SetGradOutMeta(Residual, 1);
grad_node->SetGradOutMeta(Bias, 2);
grad_node->SetGradOutMeta(LnScale, 3);
grad_node->SetGradOutMeta(LnBias, 4);

egr::EagerUtils::SetOutRankWithSlot(p_autograd_BiasDropoutResidualOut, 0);
grad_node->SetGradInMeta(BiasDropoutResidualOut, 0);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_DropoutMaskOut, 1);
grad_node->SetGradInMeta(DropoutMaskOut, 1);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_LnMean, 2);
grad_node->SetGradInMeta(LnMean, 2);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_LnVariance, 3);
grad_node->SetGradInMeta(LnVariance, 3);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Y, 4);
egr::EagerUtils::SetHistory(p_autograd_Y, grad_node);
grad_node->SetGradInMeta(Y, 4);
egr::EagerUtils::CheckAndRetainGrad(Y);
}
}

return std::make_tuple(
BiasDropoutResidualOut, DropoutMaskOut, LnMean, LnVariance, Y);
}
@@ -3,4 +3,5 @@ set(fluid_manual_nodes
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gemm_epilogue_node.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_bias_dropout_residual_layer_norm_node.cc
PARENT_SCOPE)
paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_bias_dropout_residual_layer_norm_node.cc (new file)
@@ -0,0 +1,135 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "glog/logging.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/phi/api/all.h"

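// Backward node for fused_bias_dropout_residual_layer_norm. operator()
// recovers the tensors saved by the forward pass, traces the corresponding
// *_grad fluid op, and returns the gradients in the slot order set by the
// forward function: 0: X, 1: Residual, 2: Bias, 3: LnScale, 4: LnBias.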
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
fused_bias_dropout_residual_layer_normGradNodeCompat::operator()(
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>& grads,
bool create_graph,
bool is_new_grad) {
const auto& out_metas = OutputMeta();
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
outputs(5);
VLOG(3) << "Running Eager Backward Node: "
"fused_bias_dropout_residual_layer_normGradNodeCompat";
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
hooked_grads0 = fused_bias_dropout_residual_layer_normGradNodeCompat::
ApplyGradientHooks(grads);
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> ins0 =
{{"BiasDropoutResidualOut",
egr::EagerUtils::TrySyncToVars(egr::EagerUtils::RecoverTensorWrapper(
&this->BiasDropoutResidualOut_))},
{"DropoutMaskOut",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->DropoutMaskOut_))},
{"LnMean",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->LnMean_))},
{"LnVariance",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->LnVariance_))},
{"Residual",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->Residual_))},
{"X",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->X_))},
{"Y@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[4])}};

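// Optional inputs saved in the forward pass are only fed to the grad op when
// the recovered tensor is defined.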
auto Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Bias_);

if (Bias.defined()) ins0["Bias"] = egr::EagerUtils::TrySyncToVars(Bias);

auto LnBias = egr::EagerUtils::RecoverTensorWrapper(&this->LnBias_);
if (LnBias.defined()) ins0["LnBias"] = egr::EagerUtils::TrySyncToVars(LnBias);
auto LnScale = egr::EagerUtils::RecoverTensorWrapper(&this->LnScale_);
if (LnScale.defined())
ins0["LnScale"] = egr::EagerUtils::TrySyncToVars(LnScale);
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> outs0;

if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) {
outs0.insert({"BiasDropoutResidualOut@GRAD",
egr::EagerUtils::TrySyncToVars(hooked_grads0[0])});
}
if ((!out_metas[1].empty()) && (!(out_metas[1][0].IsStopGradient()))) {
outs0.insert({"Residual@GRAD",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}});
}
if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) {
outs0.insert({"X@GRAD",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}});
}

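// Grad outputs are only allocated for slots whose forward inputs do not stop
// gradient (and, for the optional inputs, only when the input was defined).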
if (Bias.defined() && (!out_metas[2].empty()) &&
(!out_metas[2][0].IsStopGradient()))
outs0["Bias@GRAD"] = {std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())};
if (LnBias.defined() && (!out_metas[4].empty()) &&
(!out_metas[4][0].IsStopGradient()))
outs0["LnBias@GRAD"] = {std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())};
if (LnScale.defined() && (!out_metas[3].empty()) &&
(!out_metas[3][0].IsStopGradient()))
outs0["LnScale@GRAD"] = {std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())};
auto& attrs_map0 = this->attr_map_;
// Pass the entire attribute map to TraceOp;
// the underlying kernel will pick up whatever attributes it needs at runtime.

egr::Controller::Instance().GetCurrentTracer()->TraceOp(
"fused_bias_dropout_residual_layer_norm_grad",
ins0,
outs0,
attrs_map0,
egr::Controller::Instance().GetExpectedPlace(),
&this->default_attr_map_,
false,
{});

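// Copy the produced gradients back into the output slots, matching the
// SetGradOutMeta order used in the forward function.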
if (outs0.find("Bias@GRAD") != outs0.end()) {
outputs[2] = egr::EagerUtils::GetOutputs(outs0["Bias@GRAD"]);
}

if (outs0.find("LnBias@GRAD") != outs0.end()) {
outputs[4] = egr::EagerUtils::GetOutputs(outs0["LnBias@GRAD"]);
}

if (outs0.find("LnScale@GRAD") != outs0.end()) {
outputs[3] = egr::EagerUtils::GetOutputs(outs0["LnScale@GRAD"]);
}

if (outs0.find("Residual@GRAD") != outs0.end()) {
outputs[1] = egr::EagerUtils::GetOutputs(outs0["Residual@GRAD"]);
}

if (outs0.find("X@GRAD") != outs0.end()) {
outputs[0] = egr::EagerUtils::GetOutputs(outs0["X@GRAD"]);
}

if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs);
return outputs;
}