From f6660119967d08a2286553c6ea4a3aed618d0c9a Mon Sep 17 00:00:00 2001 From: Shuotian Cheng Date: Tue, 6 Nov 2018 02:26:16 -0800 Subject: [PATCH] [teammgrd]: Add retry logic when enslaving member port into team (#669) * [teammgrd]: Add retry logic when enslaving member port into team When a port is not set as admin down, teamd will prevent the port from being added into the port channel. This could happen when some other processes or users are executing commands to bring up the port. This commit will try enslaving the port up to three times before returning false. Example error message when adding a port which is admin up: libteamdctl: cli_usock_process_msg: usock: Error message received: "PortAddFail" libteamdctl: cli_usock_process_msg: usock: Error message content: "Failed to add port." command call failed (Invalid argument) * [teammgrd]: Check the admin status after the teamdctl command failure Instead of having retry logic with time limitations, it is better to confirm the exact reason for the failure: whether the member port is admin up or not. The function checkPortIffUp() is added to get the netdev flags and check the IFF_UP flag. The add member logic will only retry when the teamdctl command fails and the member port is still UP. 
Signed-off-by: Shu0T1an ChenG --- cfgmgr/teammgr.cpp | 105 +++++++++++++++++++++++++++++++++++++-------- cfgmgr/teammgr.h | 4 +- 2 files changed, 89 insertions(+), 20 deletions(-) diff --git a/cfgmgr/teammgr.cpp b/cfgmgr/teammgr.cpp index 47bbb8f434a9..c68e0bcaaca1 100644 --- a/cfgmgr/teammgr.cpp +++ b/cfgmgr/teammgr.cpp @@ -1,3 +1,5 @@ +#include + #include "exec.h" #include "teammgr.h" #include "logger.h" @@ -8,6 +10,9 @@ #include #include +#include +#include + using namespace std; using namespace swss; @@ -182,7 +187,7 @@ void TeamMgr::doLagMemberTask(Consumer &consumer) { KeyOpFieldsValuesTuple t = it->second; - auto tokens = tokenize(kfvKey(t), '|'); + auto tokens = tokenize(kfvKey(t), config_db_key_delimiter); auto lag = tokens[0]; auto member = tokens[1]; @@ -196,7 +201,11 @@ void TeamMgr::doLagMemberTask(Consumer &consumer) continue; } - addLagMember(lag, member); + if (addLagMember(lag, member) == task_need_retry) + { + it++; + continue; + } } else if (op == DEL_COMMAND) { @@ -207,6 +216,49 @@ void TeamMgr::doLagMemberTask(Consumer &consumer) } } +bool TeamMgr::checkPortIffUp(const string &port) +{ + SWSS_LOG_ENTER(); + + struct ifreq ifr; + memcpy(ifr.ifr_name, port.c_str(), strlen(port.c_str())); + ifr.ifr_name[strlen(port.c_str())] = 0; + + int fd = socket(AF_UNIX, SOCK_DGRAM, 0); + if (fd == -1 || ioctl(fd, SIOCGIFFLAGS, &ifr) == -1) + { + SWSS_LOG_ERROR("Failed to get port %s flags", port.c_str()); + return false; + } + + SWSS_LOG_INFO("Get port %s flags %i", port.c_str(), ifr.ifr_flags); + + return ifr.ifr_flags & IFF_UP; +} + +bool TeamMgr::findPortMaster(string &master, const string &port) +{ + SWSS_LOG_ENTER(); + + vector keys; + m_cfgLagMemberTable.getKeys(keys); + + for (auto key: keys) + { + auto tokens = tokenize(key, config_db_key_delimiter); + auto lag = tokens[0]; + auto member = tokens[1]; + + if (port == member) + { + master = lag; + return true; + } + } + + return false; +} + // When a port gets removed and created again, notification is 
triggered // when state dabatabase gets updated. In this situation, the port needs // to be enslaved into the LAG again. @@ -226,21 +278,13 @@ void TeamMgr::doPortUpdateTask(Consumer &consumer) { SWSS_LOG_INFO("Received port %s state update", alias.c_str()); - vector keys; - m_cfgLagMemberTable.getKeys(keys); - - for (auto key : keys) + string lag; + if (findPortMaster(lag, alias)) { - auto tokens = tokenize(key, '|'); - - auto lag = tokens[0]; - auto member = tokens[1]; - - // Find the master of the port - if (alias == member) + if (addLagMember(lag, alias) == task_need_retry) { - addLagMember(lag, alias); - break; + it++; + continue; } } } @@ -291,7 +335,7 @@ bool TeamMgr::setLagMtu(const string &alias, const string &mtu) for (auto key : keys) { - auto tokens = tokenize(key, '|'); + auto tokens = tokenize(key, config_db_key_delimiter); auto lag = tokens[0]; auto member = tokens[1]; @@ -362,7 +406,7 @@ bool TeamMgr::removeLag(const string &alias) // Once a port is enslaved into a port channel, the port's MTU will // be inherited from the master's MTU while the port's admin status // will still be controlled separately. -bool TeamMgr::addLagMember(const string &lag, const string &member) +task_process_status TeamMgr::addLagMember(const string &lag, const string &member) { SWSS_LOG_ENTER(); @@ -373,7 +417,29 @@ bool TeamMgr::addLagMember(const string &lag, const string &member) // ip link set dev down; // teamdctl port add ; cmd << IP_CMD << " link set dev " << member << " down; "; - cmd << TEAMDCTL_CMD << " " << lag << " port add " << member << "; "; + cmd << TEAMDCTL_CMD << " " << lag << " port add " << member; + + if (exec(cmd.str(), res) != 0) + { + // teamdctl port add command will fail when the member port is not + // set to admin status down; it is possible that some other processes + // or users (e.g. portmgrd) are executing the command to bring up the + // member port while adding this port into the port channel. 
This piece + // of code will check if the port is set to admin status up. If yes, + // it will retry to add the port into the port channel. + if (checkPortIffUp(member)) + { + SWSS_LOG_INFO("Failed to add %s to port channel %s, retry...", + member.c_str(), lag.c_str()); + return task_need_retry; + } + else + { + SWSS_LOG_ERROR("Failed to add %s to port channel %s", + member.c_str(), lag.c_str()); + return task_failed; + } + } vector fvs; m_cfgPortTable.get(member, fvs); @@ -403,6 +469,7 @@ bool TeamMgr::addLagMember(const string &lag, const string &member) } // ip link set dev [up|down] + cmd.str(string()); cmd << IP_CMD << " link set dev " << member << " " << admin_status; EXEC_WITH_ERROR_THROW(cmd.str(), res); @@ -415,7 +482,7 @@ bool TeamMgr::addLagMember(const string &lag, const string &member) SWSS_LOG_NOTICE("Add %s to port channel %s", member.c_str(), lag.c_str()); - return true; + return task_success; } // Once a port is removed from from the master, both the admin status and the diff --git a/cfgmgr/teammgr.h b/cfgmgr/teammgr.h index 8e26c81a59ef..8ffb6de0e0db 100644 --- a/cfgmgr/teammgr.h +++ b/cfgmgr/teammgr.h @@ -40,12 +40,14 @@ class TeamMgr : public Orch bool addLag(const string &alias, int min_links, bool fall_back); bool removeLag(const string &alias); - bool addLagMember(const string &lag, const string &member); + task_process_status addLagMember(const string &lag, const string &member); bool removeLagMember(const string &lag, const string &member); bool setLagAdminStatus(const string &alias, const string &admin_status); bool setLagMtu(const string &alias, const string &mtu); + bool findPortMaster(string &, const string &); + bool checkPortIffUp(const string &); bool isPortStateOk(const string&); bool isLagStateOk(const string&); };