Skip to content

Commit

Permalink
Fix inconsistent mux state (#92)
Browse files Browse the repository at this point in the history
What is the motivation for this PR?
This issue occurs when running config load_minigraph to load new configs at both ToRs.
After write_standby.py, the muxorch will try to direct all traffic downstream to the SoC IP and server IP to the tunnel, but xcvrd might fail to set the adminforwardingstate of the port via gRPC because the gRPC channel could not be established because the other ToR is in standby state.
After linkmgrd starts to run and receives heartbeats from itself, it will try to toggle to active[toggle#1], but xcvrd might not be able to make the hardware toggle at the moment, so linkmgrd will mux wait. Also, because the mux probe table is initialized with Unknown state, linkmgrd will handle the initial mux probe state to have the composite states (active, unknown, up) and tries to probe the mux state.
As the muxorch has been toggled to active[toggle#1], the gRPC channel will be established at some point after, xcvrd will be able to answer the mux probe, so the linkmgrd will be able to change into (active, active, up) state.
But the toggling to active[toggle#1] is only finished half way, the mux status in STATE_DB:MUX_CABLE_TABLE is not updated, so show mux status will show unknown for those ports.

Signed-off-by: Longxiang Lyu lolv@microsoft.com

How did you do it?
When the linkmgrd changes into states (active, active, up) and has the original mux state as unknown, it will toggle the mux to active again to have those DB tables updated:

linkmgrd  -> APP_DB:MUX_CABLE_TABLE -> swss -> APP_DB:HW_MUX_CABLE_TABLE -> xcvrd 
xcvrd -> STATE_DB:HW_MUX_CABLE_TABLE -> swss -> STATE_DB:MUX_CABLE_TABLE -> linkmgrd  
How did you verify/test it?
On dualtor-mixed topo with icmp_responder running, do config load_minigraph on both ToRs, verify the show mux status on both ToRs.

Signed-off-by: Longxiang Lyu <lolv@microsoft.com>
  • Loading branch information
lolyu authored and yxieca committed Jun 28, 2022
1 parent 59334be commit a828e86
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 13 deletions.
28 changes: 25 additions & 3 deletions src/link_manager/LinkManagerStateMachineActiveActive.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,15 @@ void ActiveActiveStateMachine::initializeTransitionFunctionTable()
MUXLOGWARNING("Initializing State Transition Table...");
LinkManagerStateMachineBase::initializeTransitionFunctionTable();

mStateTransitionHandler[link_prober::LinkProberState::Label::Active]
[mux_state::MuxState::Label::Active]
[link_state::LinkState::Label::Up] =
boost::bind(
&ActiveActiveStateMachine::LinkProberActiveMuxActiveLinkUpTransitionFunction,
this,
boost::placeholders::_1
);

mStateTransitionHandler[link_prober::LinkProberState::Label::Active]
[mux_state::MuxState::Label::Standby]
[link_state::LinkState::Label::Up] =
Expand Down Expand Up @@ -580,6 +589,19 @@ void ActiveActiveStateMachine::initializeTransitionFunctionTable()
);
}

//
// ---> LinkProberActiveMuxActiveLinkUpTransitionFunction(CompositeState &nextState);
//
// transition function when entering {LinkProberActive, MuxActive, LinkUp} state
//
void ActiveActiveStateMachine::LinkProberActiveMuxActiveLinkUpTransitionFunction(CompositeState &nextState)
{
MUXLOGINFO(mMuxPortConfig.getPortName());
if (ms(mCompositeState) == mux_state::MuxState::Unknown) {
switchMuxState(nextState, mux_state::MuxState::Label::Active);
}
}

//
// ---> LinkProberActiveMuxStandbyLinkUpTransitionFunction(CompositeState &nextState);
//
Expand Down Expand Up @@ -1002,9 +1024,9 @@ void ActiveActiveStateMachine::handlePeerMuxWaitTimeout(boost::system::error_cod
}
}

//
//
// ---> handleDefaultRouteStateNotification(const DefaultRoute routeState);
//
//
// handle default route state notification from routeorch
//
void ActiveActiveStateMachine::handleDefaultRouteStateNotification(const DefaultRoute routeState)
Expand All @@ -1029,7 +1051,7 @@ void ActiveActiveStateMachine::shutdownOrRestartLinkProberOnDefaultRoute()
if (mMuxPortConfig.getMode() == common::MuxPortConfig::Mode::Auto && mDefaultRouteState == DefaultRoute::NA) {
mShutdownTxFnPtr();
} else {
// If mux mode is in manual/standby/active mode, we should restart link prober.
// If mux mode is in manual/standby/active mode, we should restart link prober.
// If default route state is "ok", we should retart link prober.
mRestartTxFnPtr();
}
Expand Down
9 changes: 9 additions & 0 deletions src/link_manager/LinkManagerStateMachineActiveActive.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,15 @@ class ActiveActiveStateMachine : public LinkManagerStateMachineBase,
*/
void initializeTransitionFunctionTable() override;

/**
* @method LinkProberActiveMuxActiveLinkUpTransitionFunction
*
* @brief transition function when entering {LinkProberActive, MuxActive, LinkUp} state
*
* @param nextState reference to composite state
*/
void LinkProberActiveMuxActiveLinkUpTransitionFunction(CompositeState &nextState);

/**
* @method LinkProberActiveMuxStandbyLinkUpTransitionFunction
*
Expand Down
51 changes: 41 additions & 10 deletions test/LinkManagerStateMachineActiveActiveTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ TEST_F(LinkManagerStateMachineActiveActiveTest, MuxActiveLinkProberPeerUnknown)
TEST_F(LinkManagerStateMachineActiveActiveTest, MuxActiveConfigDetachedLinkProberPeerUnknown)
{
setMuxActive();

postPeerLinkProberEvent(link_prober::LinkProberState::PeerActive);
VALIDATE_PEER_STATE(PeerActive, Active);

Expand Down Expand Up @@ -415,26 +415,57 @@ TEST_F(LinkManagerStateMachineActiveActiveTest, MuxStandbyLinkProberPeerUnknown)
EXPECT_EQ(mDbInterfacePtr->mSetPeerMuxStateInvokeCount, 0);
}

TEST_F(LinkManagerStateMachineActiveActiveTest, MuxActivDefaultRouteState)
TEST_F(LinkManagerStateMachineActiveActiveTest, MuxActivDefaultRouteState)
{
setMuxActive();

postDefaultRouteEvent("ok", 1);
EXPECT_FALSE(mMuxConfig.getIfEnableDefaultRouteFeature());
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount,0);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount,1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount, 0);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount, 1);

postDefaultRouteEvent("na", 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount,0);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount,2);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount, 0);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount, 2);

mMuxConfig.enableDefaultRouteFeature(true);
postDefaultRouteEvent("na", 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount,1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount,2);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount, 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount, 2);

postDefaultRouteEvent("ok", 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount,1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount,3);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mShutdownTxProbeCallCount, 1);
EXPECT_EQ(mFakeMuxPort.mFakeLinkProber->mRestartTxProbeCallCount, 3);
}

TEST_F(LinkManagerStateMachineActiveActiveTest, LinkmgrdBootupSequenceHeartBeatFirst)
{
activateStateMachine();
VALIDATE_STATE(Wait, Wait, Down);

postLinkEvent(link_state::LinkState::Up);
VALIDATE_STATE(Wait, Wait, Up);

// the first toggle fails because the the inital mux state
// is standby when linkmgrd first boots up
postLinkProberEvent(link_prober::LinkProberState::Active, 4);
VALIDATE_STATE(Active, Active, Up);
EXPECT_EQ(mDbInterfacePtr->mSetMuxStateInvokeCount, 1);
EXPECT_EQ(mDbInterfacePtr->mLastSetMuxState, mux_state::MuxState::Label::Active);

// now linkmgrd should be stuck in mux wait timeout

handleProbeMuxState("unknown", 3);
VALIDATE_STATE(Active, Unknown, Up);

// now linkmgrd should be stuck in mux probe timeout
runIoService(4);

// xcvrd now answers the mux probe
handleProbeMuxState("active", 4);
VALIDATE_STATE(Active, Active, Up);
EXPECT_EQ(mDbInterfacePtr->mSetMuxStateInvokeCount, 2);
EXPECT_EQ(mDbInterfacePtr->mLastSetMuxState, mux_state::MuxState::Label::Active);
}

} /* namespace test */

0 comments on commit a828e86

Please # to comment.