From f855780870fe7511ac58fdaaa3095f62f32ef6be Mon Sep 17 00:00:00 2001 From: Taleb Zeghmi Date: Wed, 22 May 2024 13:58:36 -0700 Subject: [PATCH 1/3] AIP-8440 retry opsgenie error --- .../aip/tests/run_integration_tests.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/metaflow/plugins/aip/tests/run_integration_tests.py b/metaflow/plugins/aip/tests/run_integration_tests.py index ca0d0b08e72..f658a3188fa 100644 --- a/metaflow/plugins/aip/tests/run_integration_tests.py +++ b/metaflow/plugins/aip/tests/run_integration_tests.py @@ -168,17 +168,19 @@ def test_error_and_opsgenie_alert(pytestconfig) -> None: close_alert_endpoint: str = ( f"https://api.opsgenie.com/v2/alerts/{alert_alias}/close?identifierType=alias" ) - close_alert_response: Response = requests.post( - close_alert_endpoint, - data=json.dumps(close_alert_data), - headers=opsgenie_auth_headers, - ) - # Sometimes the response status code is 202, signaling - # the request has been accepted and is being queued for processing. - assert ( - close_alert_response.status_code == 200 - or close_alert_response.status_code == 202 - ) + + # retry 3 times with a sleep of 3s until the alert is closed + for _ in range(3): + close_alert_response: Response = requests.post( + close_alert_endpoint, + data=json.dumps(close_alert_data), + headers=opsgenie_auth_headers, + ) + if close_alert_response.status_code == 200: + break + time.sleep(3) + + assert close_alert_response.status_code == 200 # Test logging of raise_error_flow check_error_handling_flow_cmd: str = ( From 142c09c1b28b8924345e418c0b691cd18d403065 Mon Sep 17 00:00:00 2001 From: Taleb Zeghmi Date: Wed, 22 May 2024 15:19:07 -0700 Subject: [PATCH 2/3] handle close_alert_response.status_code == 202 --- metaflow/plugins/aip/tests/run_integration_tests.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/metaflow/plugins/aip/tests/run_integration_tests.py b/metaflow/plugins/aip/tests/run_integration_tests.py index f658a3188fa..3400eb832f3 100644 --- a/metaflow/plugins/aip/tests/run_integration_tests.py +++ b/metaflow/plugins/aip/tests/run_integration_tests.py @@ -176,7 +176,12 @@ def test_error_and_opsgenie_alert(pytestconfig) -> None: data=json.dumps(close_alert_data), headers=opsgenie_auth_headers, ) - if close_alert_response.status_code == 200: + # Sometimes the response status code is 202, signaling + # the request has been accepted and is being queued for processing. + if ( + close_alert_response.status_code == 200 + or close_alert_response.status_code == 202 + ): break time.sleep(3) From 38bae2e3f03b497ffbab3a51536ac5cf818d7c5c Mon Sep 17 00:00:00 2001 From: Taleb Zeghmi Date: Wed, 22 May 2024 18:24:22 -0700 Subject: [PATCH 3/3] assert fix --- .../plugins/aip/tests/run_integration_tests.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/metaflow/plugins/aip/tests/run_integration_tests.py b/metaflow/plugins/aip/tests/run_integration_tests.py index 3400eb832f3..5bf974f69db 100644 --- a/metaflow/plugins/aip/tests/run_integration_tests.py +++ b/metaflow/plugins/aip/tests/run_integration_tests.py @@ -169,6 +169,14 @@ def test_error_and_opsgenie_alert(pytestconfig) -> None: f"https://api.opsgenie.com/v2/alerts/{alert_alias}/close?identifierType=alias" ) + def is_valid_status_code(close_alert_response): + # Sometimes the response status code is 202, signaling + # the request has been accepted and is being queued for processing. + return ( + close_alert_response.status_code == 200 + or close_alert_response.status_code == 202 + ) + # retry 3 times with a sleep of 3s until the alert is closed for _ in range(3): close_alert_response: Response = requests.post( @@ -176,16 +184,11 @@ def test_error_and_opsgenie_alert(pytestconfig) -> None: data=json.dumps(close_alert_data), headers=opsgenie_auth_headers, ) - # Sometimes the response status code is 202, signaling - # the request has been accepted and is being queued for processing. - if ( - close_alert_response.status_code == 200 - or close_alert_response.status_code == 202 - ): + if is_valid_status_code(close_alert_response): break time.sleep(3) - assert close_alert_response.status_code == 200 + assert is_valid_status_code(close_alert_response) # Test logging of raise_error_flow check_error_handling_flow_cmd: str = (