From 0e6bc9cc28662e7392b94700543c69f755b19d0d Mon Sep 17 00:00:00 2001 From: Lawrence Lee Date: Thu, 22 Jun 2023 09:14:10 +0000 Subject: [PATCH 1/3] [arp_update]: Fix IPv6 neighbor race condition Signed-off-by: Lawrence Lee --- files/scripts/arp_update | 89 +++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 24 deletions(-) diff --git a/files/scripts/arp_update b/files/scripts/arp_update index 4b25973cfc02..0986a011a8ca 100755 --- a/files/scripts/arp_update +++ b/files/scripts/arp_update @@ -89,32 +89,73 @@ while /bin/true; do eval `eval $ip6cmd` if [[ $SUBTYPE == "dualtor" ]]; then - # manually set any remaining FAILED/INCOMPLETE entries to permanently INCOMPLETE - # this prevents any remaining INCOMPLETE entries from automatically transitioning to FAILED - # once these entries are incomplete, any subsequent neighbor advertisement messages - # are able to resolve the entry + # capture all current failed/incomplete IPv6 neighbors in the kernel to avoid situations where new neighbors are learned + # in the middle of the below sequence of commands + unresolved_kernel_neighbors=$(ip -6 neigh show | grep -v fe80 | grep $vlan | grep -E 'FAILED|INCOMPLETE') + failed_kernel_neighbors=$(echo "$unresolved_kernel_neighbors" | grep FAILED | cut -d ' ' -f 1) + + # it's possible for kernel neighbors to fall out of sync with the hardware + # this can result in failed neighbors entries that don't have corresponding zero MAC neighbor entries + # and therefore don't have tunnel routes installed in the hardware + # flush these neighbors from the kernel to force relearning and resync them to the hardware: + # 1. for every FAILED or INCOMPLETE neighbor in the kernel, check if there is a corresponding zero MAC neighbor in APPL_DB + # 2. if no zero MAC neighbor entry exists, flush the kernel neighbor entry + # - generates the command 'ip neigh flush ' for all such neighbors + #ip_neigh_flush_cmd="echo \"$unresolved_kernel_neighbors\" | cut -d ' ' -f 1 | xargs -I{} bash -c \"if [[ -z \\\"\\\$(sonic-db-cli APPL_DB hget NEIGH_TABLE:$vlan:{} neigh)\\\" ]]; then echo 'ip neigh flush {};'; fi\"" + unsync_neighbors=$(echo "$unresolved_kernel_neighbors" | cut -d ' ' -f 1 | xargs -I{} bash -c "if [[ -z \"\$(sonic-db-cli APPL_DB hget NEIGH_TABLE:$vlan:{} neigh)\" ]]; then echo '{}'; fi") + + if [[ ! -z "$unsync_neighbors" ]]; then + ip_neigh_flush_cmd="echo \"$unsync_neighbors\" | sed -e 's/^/ip neigh flush /' -e 's/$/;/'" + echo `eval "$ip_neigh_flush_cmd"` + eval `eval "$ip_neigh_flush_cmd"` + fi + + # generates the following command for each FAILED or INCOMPLETE IPv6 neighbor + # timeout 0.2 ping -n -q -i 0 -c 1 -W 1 -I >/dev/null - # generates the following command for each failed or incomplete IPv6 neighbor + if [[ ! -z "$unresolved_kernel_neighbors" ]]; then + ping6_template="sed -e 's/^/timeout 0.2 ping /' -e 's/,/ -n -q -i 0 -c 1 -W 1 -I /' -e 's/$/ >\/dev\/null;/'" + failed_ip6_neigh_cmd="echo \"$unresolved_kernel_neighbors\" | cut -d ' ' -f 1,3 --output-delimiter=',' | $ping6_template" + echo `eval "$failed_ip6_neigh_cmd"` + eval `eval "$failed_ip6_neigh_cmd"` + fi + + # wait for any transient INCOMPLETE neighbors to transition to FAILED + # once these transition to FAILED, we can guarantee that the kernel has generated a netlink message for the neighbor + # and proceed setting it to permanent INCOMPLETE + # this is necessary for: + # 1. unsyncronized neighbors that were flushed, then pinged + # 2. existing FAILED neighbors which will be set to transient INCOMPLETE by the above ping + wait_neighbors="$unsync_neighbors"$'\n'"$failed_kernel_neighbors" + echo "wait neighbors: $wait_neighbors" + if [[ ! -z "$wait_neighbors" ]]; then + trans_incomplete_check=$(echo "$wait_neighbors" | sed -e 's/^/ip neigh show /' -e 's/$/ | grep -q 'INCOMPLETE' \&\&/') + trans_incomplete_check=$(echo $trans_incomplete_check | sed -e 's/\&\&$//') + echo "$trans_incomplete_check" + timeout 10 bash -c "while eval \"$trans_incomplete_check\"; do echo 'waiting'; sleep 1; done" + fi + + # some pings may create transient INCOMPLETE neighbor entries in the kernel + # wait for these to transition to FAILED befor setting them to permanently INCOMPLETE + # mcast_solicit=$(cat /proc/sys/net/ipv6/neigh/$vlan/mcast_solicit) + # retrans_time_ms=$(cat /proc/sys/net/ipv6/neigh/$vlan/retrans_time_ms) + # (( transition_delay=(((mcast_solicit * retrans_time_ms) + 999) / 1000) + 1 )) # equivalent to ceiling(mcast_solicit * retrans_time_ms / 1000) + 1 + # sleep $transition_delay + + # manually set any remaining FAILED entries to permanently INCOMPLETE + # once these entries are INCOMPLETE, any subsequent neighbor advertisement messages are able to resolve the entry + # ignore INCOMPLETE neighbors since if they are transiently incomplete (i.e. new kernel neighbors that we are attempting to resolve for the first time), + # setting them to permanently incomplete here means the kernel will never generate a netlink message for that neighbor + # generates the following command for each FAILED IPv6 neighbor # ip neigh replace dev nud incomplete - neigh_replace_template="sed -e 's/^/ip neigh replace /' -e 's/,/ dev /' -e 's/$/ nud incomplete;/'" - ip_neigh_replace_cmd="ip -6 neigh show | grep -v fe80 | grep $vlan | grep -E 'FAILED|INCOMPLETE' | cut -d ' ' -f 1,3 --output-delimiter=',' | $neigh_replace_template" - eval `eval $ip_neigh_replace_cmd` - - # on dual ToR devices, try to resolve failed neighbor entries since - # these entries will have tunnel routes installed, preventing normal - # neighbor resolution (SWSS PR #2137) - - # since ndisc6 is a userland process, the above ndisc6 commands are - # insufficient to update the kernel neighbor table for failed entries - - # we don't need to do this for ipv4 neighbors since arping is able to - # update the kernel neighbor table - - # generates the following command for each failed or incomplete IPv6 neighbor - # timeout 0.2 ping -n -q -i 0 -c 1 -W 1 -I >/dev/null - ping6_template="sed -e 's/^/timeout 0.2 ping /' -e 's/,/ -n -q -i 0 -c 1 -W 1 -I /' -e 's/$/ >\/dev\/null;/'" - failed_ip6_neigh_cmd="ip -6 neigh show | grep -v fe80 | grep $vlan | grep -E 'FAILED|INCOMPLETE' | cut -d ' ' -f 1,3 --output-delimiter=',' | $ping6_template" - eval `eval $failed_ip6_neigh_cmd` + failed_kernel_neighbors=$(ip -6 neigh show | grep -v fe80 | grep $vlan | grep -E 'FAILED') + + if [[ ! -z "$failed_kernel_neighbors" ]]; then + neigh_replace_template="sed -e 's/^/ip neigh replace /' -e 's/,/ dev /' -e 's/$/ nud incomplete;/'" + ip_neigh_replace_cmd="echo \"$failed_kernel_neighbors\" | cut -d ' ' -f 1,3 --output-delimiter=',' | $neigh_replace_template" + echo `eval "$ip_neigh_replace_cmd"` + eval `eval "$ip_neigh_replace_cmd"` + fi fi done From f00e4ade122054c087acc49d37f4aebac97081ee Mon Sep 17 00:00:00 2001 From: Lawrence Lee Date: Thu, 22 Jun 2023 09:26:46 +0000 Subject: [PATCH 2/3] cleanup code Signed-off-by: Lawrence Lee --- files/scripts/arp_update | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/files/scripts/arp_update b/files/scripts/arp_update index 0986a011a8ca..2a0f4ee73eaa 100755 --- a/files/scripts/arp_update +++ b/files/scripts/arp_update @@ -101,22 +101,17 @@ while /bin/true; do # 1. for every FAILED or INCOMPLETE neighbor in the kernel, check if there is a corresponding zero MAC neighbor in APPL_DB # 2. if no zero MAC neighbor entry exists, flush the kernel neighbor entry # - generates the command 'ip neigh flush ' for all such neighbors - #ip_neigh_flush_cmd="echo \"$unresolved_kernel_neighbors\" | cut -d ' ' -f 1 | xargs -I{} bash -c \"if [[ -z \\\"\\\$(sonic-db-cli APPL_DB hget NEIGH_TABLE:$vlan:{} neigh)\\\" ]]; then echo 'ip neigh flush {};'; fi\"" unsync_neighbors=$(echo "$unresolved_kernel_neighbors" | cut -d ' ' -f 1 | xargs -I{} bash -c "if [[ -z \"\$(sonic-db-cli APPL_DB hget NEIGH_TABLE:$vlan:{} neigh)\" ]]; then echo '{}'; fi") - if [[ ! -z "$unsync_neighbors" ]]; then ip_neigh_flush_cmd="echo \"$unsync_neighbors\" | sed -e 's/^/ip neigh flush /' -e 's/$/;/'" - echo `eval "$ip_neigh_flush_cmd"` eval `eval "$ip_neigh_flush_cmd"` fi # generates the following command for each FAILED or INCOMPLETE IPv6 neighbor # timeout 0.2 ping -n -q -i 0 -c 1 -W 1 -I >/dev/null - if [[ ! -z "$unresolved_kernel_neighbors" ]]; then ping6_template="sed -e 's/^/timeout 0.2 ping /' -e 's/,/ -n -q -i 0 -c 1 -W 1 -I /' -e 's/$/ >\/dev\/null;/'" failed_ip6_neigh_cmd="echo \"$unresolved_kernel_neighbors\" | cut -d ' ' -f 1,3 --output-delimiter=',' | $ping6_template" - echo `eval "$failed_ip6_neigh_cmd"` eval `eval "$failed_ip6_neigh_cmd"` fi @@ -127,21 +122,12 @@ while /bin/true; do # 1. unsyncronized neighbors that were flushed, then pinged # 2. existing FAILED neighbors which will be set to transient INCOMPLETE by the above ping wait_neighbors="$unsync_neighbors"$'\n'"$failed_kernel_neighbors" - echo "wait neighbors: $wait_neighbors" if [[ ! -z "$wait_neighbors" ]]; then trans_incomplete_check=$(echo "$wait_neighbors" | sed -e 's/^/ip neigh show /' -e 's/$/ | grep -q 'INCOMPLETE' \&\&/') trans_incomplete_check=$(echo $trans_incomplete_check | sed -e 's/\&\&$//') - echo "$trans_incomplete_check" timeout 10 bash -c "while eval \"$trans_incomplete_check\"; do echo 'waiting'; sleep 1; done" fi - # some pings may create transient INCOMPLETE neighbor entries in the kernel - # wait for these to transition to FAILED befor setting them to permanently INCOMPLETE - # mcast_solicit=$(cat /proc/sys/net/ipv6/neigh/$vlan/mcast_solicit) - # retrans_time_ms=$(cat /proc/sys/net/ipv6/neigh/$vlan/retrans_time_ms) - # (( transition_delay=(((mcast_solicit * retrans_time_ms) + 999) / 1000) + 1 )) # equivalent to ceiling(mcast_solicit * retrans_time_ms / 1000) + 1 - # sleep $transition_delay - # manually set any remaining FAILED entries to permanently INCOMPLETE # once these entries are INCOMPLETE, any subsequent neighbor advertisement messages are able to resolve the entry # ignore INCOMPLETE neighbors since if they are transiently incomplete (i.e. new kernel neighbors that we are attempting to resolve for the first time), @@ -149,11 +135,9 @@ while /bin/true; do # generates the following command for each FAILED IPv6 neighbor # ip neigh replace dev nud incomplete failed_kernel_neighbors=$(ip -6 neigh show | grep -v fe80 | grep $vlan | grep -E 'FAILED') - if [[ ! -z "$failed_kernel_neighbors" ]]; then neigh_replace_template="sed -e 's/^/ip neigh replace /' -e 's/,/ dev /' -e 's/$/ nud incomplete;/'" ip_neigh_replace_cmd="echo \"$failed_kernel_neighbors\" | cut -d ' ' -f 1,3 --output-delimiter=',' | $neigh_replace_template" - echo `eval "$ip_neigh_replace_cmd"` eval `eval "$ip_neigh_replace_cmd"` fi fi From cdd203b9b3f7cd59e627a53886ada648c0bef9f1 Mon Sep 17 00:00:00 2001 From: Lawrence Lee Date: Wed, 28 Jun 2023 21:49:38 +0000 Subject: [PATCH 3/3] Add sleeps Signed-off-by: Lawrence Lee --- files/scripts/arp_update | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/files/scripts/arp_update b/files/scripts/arp_update index 2a0f4ee73eaa..f267e05a54cc 100755 --- a/files/scripts/arp_update +++ b/files/scripts/arp_update @@ -105,6 +105,7 @@ while /bin/true; do if [[ ! -z "$unsync_neighbors" ]]; then ip_neigh_flush_cmd="echo \"$unsync_neighbors\" | sed -e 's/^/ip neigh flush /' -e 's/$/;/'" eval `eval "$ip_neigh_flush_cmd"` + sleep 2 fi # generates the following command for each FAILED or INCOMPLETE IPv6 neighbor @@ -113,19 +114,8 @@ while /bin/true; do ping6_template="sed -e 's/^/timeout 0.2 ping /' -e 's/,/ -n -q -i 0 -c 1 -W 1 -I /' -e 's/$/ >\/dev\/null;/'" failed_ip6_neigh_cmd="echo \"$unresolved_kernel_neighbors\" | cut -d ' ' -f 1,3 --output-delimiter=',' | $ping6_template" eval `eval "$failed_ip6_neigh_cmd"` - fi - - # wait for any transient INCOMPLETE neighbors to transition to FAILED - # once these transition to FAILED, we can guarantee that the kernel has generated a netlink message for the neighbor - # and proceed setting it to permanent INCOMPLETE - # this is necessary for: - # 1. unsyncronized neighbors that were flushed, then pinged - # 2. existing FAILED neighbors which will be set to transient INCOMPLETE by the above ping - wait_neighbors="$unsync_neighbors"$'\n'"$failed_kernel_neighbors" - if [[ ! -z "$wait_neighbors" ]]; then - trans_incomplete_check=$(echo "$wait_neighbors" | sed -e 's/^/ip neigh show /' -e 's/$/ | grep -q 'INCOMPLETE' \&\&/') - trans_incomplete_check=$(echo $trans_incomplete_check | sed -e 's/\&\&$//') - timeout 10 bash -c "while eval \"$trans_incomplete_check\"; do echo 'waiting'; sleep 1; done" + # allow some time for any transient INCOMPLETE neighbors to transition to FAILED + sleep 5 fi # manually set any remaining FAILED entries to permanently INCOMPLETE