Skip to content

Commit

Permalink
Count the number of undeliverable IPinIP packets (#21811)
Browse files Browse the repository at this point in the history
<!--
 Please make sure you've read and understood our contributing guidelines:
 https://github.com/Azure/SONiC/blob/gh-pages/CONTRIBUTING.md

 ** Make sure all your commits include a signature generated with `git commit -s` **

 If this is a bug fix, make sure your description includes "fixes #xxxx", or
 "closes #xxxx" or "resolves #xxxx"

 Please provide the following information:
-->

#### Why I did it
When an IPinIP packet lands on a TOR where the inner destination IP is not learned, that packet gets dropped. Keep count of the number of dropped packets so we can monitor/detect any traffic issues.

##### Work item tracking
- Microsoft ADO **24106234**:

#### How I did it
Create a new table in COUNTERS_DB to store undeliverable IPinIP packets and update it whenever the tunnel packet handler sniffs such a packet.

#### How to verify it
Send an IPinIP packet to a TOR where the inner destination IP of the packet is not resolved on the TOR. Verify that the `COUNTERS:IPINIP_TUNNEL` table roughly reflects the number of packets sent. We allow some inaccuracy in the counter due to poor scapy performance under load, as being able to track trends is more important than a 100% accurate packet count.

Link to sonic-mgmt test: sonic-net/sonic-mgmt#14128

<!--
If PR needs to be backported, then the PR must be tested against the base branch and the earliest backport release branch and provide tested image version on these two branches. For example, if the PR is requested for master, 202211 and 202012, then the requester needs to provide test results on master and 202012.
-->

#### Which release branch to backport (provide reason below if selected)

<!--
- Note we only backport fixes to a release branch, *not* features!
- Please also provide a reason for the backporting below.
- e.g.
- [x] 202006
-->

- [ ] 201811
- [ ] 201911
- [ ] 202006
- [ ] 202012
- [ ] 202106
- [ ] 202111
- [ ] 202205
- [ ] 202211
- [ ] 202305

#### Tested branch (Please provide the tested image version)

<!--
- Please provide tested image version
- e.g.
- [x] 20201231.100
-->

- [ ] <!-- image version 1 -->
- [ ] <!-- image version 2 -->

#### Description for the changelog
<!--
Write a short (one line) summary that describes the changes in this
pull request for inclusion in the changelog:
-->

<!--
 Ensure to add label/tag for the feature raised. example - PR#2174 under sonic-utilities repo. where, Generic Config and Update feature has been labelled as GCU.
-->

#### Link to config_db schema for YANG module changes
<!--
Provide a link to config_db schema for the table for which YANG model
is defined
Link should point to correct section on https://github.com/Azure/sonic-buildimage/blob/master/src/sonic-yang-models/doc/Configuration.md
-->

#### A picture of a cute animal (not mandatory but encouraged)
  • Loading branch information
mssonicbld authored Feb 21, 2025
1 parent 95e19d8 commit 8376d70
Showing 1 changed file with 34 additions and 4 deletions.
38 changes: 34 additions & 4 deletions dockers/docker-orchagent/tunnel_packet_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from datetime import datetime
from ipaddress import ip_interface
from queue import Queue
from threading import Lock, Event, Thread

from swsscommon.swsscommon import ConfigDBConnector, SonicV2Connector, \
DBConnector, Select, SubscriberStateTable
Expand All @@ -29,6 +30,9 @@

STATE_DB = 'STATE_DB'
APPL_DB = 'APPL_DB'
COUNTERS_DB = 'COUNTERS_DB'
TUNNEL_PKT_COUNTER_TEMPLATE = 'COUNTERS{}IPINIP_TUNNEL_CPU_PKTS'
COUNTER_KEY = 'RX_COUNT'
PORTCHANNEL_INTERFACE_TABLE = 'PORTCHANNEL_INTERFACE'
TUNNEL_TABLE = 'TUNNEL'
PEER_SWITCH_TABLE = 'PEER_SWITCH'
Expand Down Expand Up @@ -69,13 +73,18 @@ def __init__(self):
self.config_db.connect()
self.state_db = SonicV2Connector()
self.state_db.connect(STATE_DB)
self.counters_db = SonicV2Connector()
self.counters_db.connect(COUNTERS_DB)
counters_db_separator = self.counters_db.get_db_separator(COUNTERS_DB)
self.tunnel_counter_table = TUNNEL_PKT_COUNTER_TEMPLATE.format(counters_db_separator)
self._portchannel_intfs = None
self.up_portchannels = None
self.netlink_api = IPRoute()
self.sniffer = None
self.self_ip = ''
self.packet_filter = ''
self.sniff_intfs = set()
self.pending_cmds = Queue()

global portchannel_intfs
portchannel_intfs = [name for name, _ in self.portchannel_intfs]
Expand Down Expand Up @@ -304,6 +313,27 @@ def start_sniffer(self):
while not hasattr(self.sniffer, 'stop_cb'):
time.sleep(0.1)

def write_count_to_db(self):
    """
    Worker loop that runs queued ping commands and updates the packet counter

    Blocks until at least one command is available in self.pending_cmds,
    then keeps draining the queue until it is empty or 100 unique commands
    have been collected. Every dequeued command counts as one packet, but
    each unique command is only executed once per batch. After running the
    batch, the number of packets seen is added to the COUNTER_KEY field of
    self.tunnel_counter_table in COUNTERS_DB.

    This method never returns; it is meant to be used as a thread target.
    """
    while True:
        # use a set to automatically deduplicate destination IPs
        to_run = set()

        # blocking get: sleep here until at least one command is queued
        to_run.add(tuple(self.pending_cmds.get()))
        pkt_count = 1
        while not self.pending_cmds.empty() and len(to_run) < 100:
            to_run.add(tuple(self.pending_cmds.get()))
            # we should always count each packet, but only ping for each unique IP
            pkt_count += 1

        for cmds in to_run:
            logger.log_info("Running command '{}'".format(' '.join(cmds)))
            subprocess.run(cmds, stdout=subprocess.DEVNULL)

        try:
            curr_count = int(self.counters_db.get(COUNTERS_DB, self.tunnel_counter_table, COUNTER_KEY))
        except (TypeError, ValueError):
            # TypeError: no usable value came back (presumably None when the
            # counter has not been written yet -- confirm against the DB API).
            # ValueError: the stored value is not a valid integer string.
            # In both cases restart from zero rather than letting the
            # exception terminate this worker loop.
            curr_count = 0
        self.counters_db.set(COUNTERS_DB, self.tunnel_counter_table, COUNTER_KEY, str(curr_count + pkt_count))

def ping_inner_dst(self, packet):
"""
Pings the inner destination IP for an encapsulated packet
Expand All @@ -319,8 +349,7 @@ def ping_inner_dst(self, packet):
cmds.append('-6')
dst_ip = packet[IP].payload[inner_packet_type].dst
cmds.append(dst_ip)
logger.log_info("Running command '{}'".format(' '.join(cmds)))
subprocess.run(cmds, stdout=subprocess.DEVNULL)
self.pending_cmds.put(cmds)

def listen_for_tunnel_pkts(self):
"""
Expand All @@ -339,7 +368,6 @@ def listen_for_tunnel_pkts(self):
logger.log_notice('Starting tunnel packet handler for {}'
.format(self.packet_filter))


app_db = DBConnector(APPL_DB, 0)
lag_table = SubscriberStateTable(app_db, LAG_TABLE)
sel = Select()
Expand All @@ -355,7 +383,7 @@ def listen_for_tunnel_pkts(self):
elif rc == Select.ERROR:
raise Exception("Select() error")
else:
lag, op, fvs = lag_table.pop()
lag, _, fvs = lag_table.pop()
if self.sniffer_restart_required(lag, fvs):
self.sniffer.stop()
start = datetime.now()
Expand All @@ -374,6 +402,8 @@ def run(self):
Entry point for the TunnelPacketHandler class
"""
self.wait_for_portchannels()
db_thread = Thread(target=self.write_count_to_db, daemon=True)
db_thread.start()
self.listen_for_tunnel_pkts()


Expand Down

0 comments on commit 8376d70

Please sign in to comment.