
Commit

Add alert routing (#149)
adrain-cb authored Aug 24, 2023
1 parent d9da3ae commit 63104d3
Showing 35 changed files with 1,555 additions and 279 deletions.
File renamed without changes.
3 changes: 2 additions & 1 deletion .gitignore
@@ -2,4 +2,5 @@
config.env
/.vscode/
/.idea
genesis.json
genesis.json
alert-routing.yaml
27 changes: 27 additions & 0 deletions alerts-template.yaml
@@ -0,0 +1,27 @@
## This is a template for alert routing configuration.
alertRoutes:
  low:
    slack:
      low_oncall:
        url: ""
        channel: ""

  medium:
    slack:
      medium_oncall:
        url: ""
        channel: ""
    pagerduty:
      config:
        integration_key: ""

  high:
    slack:
      high_oncall:
        url: ""
        channel: ""
    pagerduty:
      high_oncall:
        integration_key: ${MY_INTEGRATION_KEY}
      medium_oncall:
        integration_key: ""
53 changes: 53 additions & 0 deletions docs/alert-routing.md
@@ -0,0 +1,53 @@
---
layout: page
title: Alert Routing
permalink: /alert-routing
---


## Overview
The alert routing feature enables users to define a number of alert destinations and then route alerts to those
destinations based on the alert's severity. For example, a user may want to send all alerts to Slack but only send high
severity alerts to PagerDuty.


## How it works
Alerts are routed to destinations based on the severity of the heuristic that produced them.
When a heuristic is deployed, the user must specify the severity of the alerts it will produce.
When the heuristic fires, the resulting alert is routed to every destination configured for that severity.
For example, an alert produced by a high severity heuristic is routed to all configured destinations
that support high severity alerts.
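
For a concrete picture, the e2e test added in this commit (`e2e/e2e_test.go`, shown further down) bootstraps a
heuristic session whose `AlertPolicy` carries the severity; a trimmed excerpt (setup and error handling omitted)
looks like this:

```go
// Trimmed from Test_Contract_Event_High_Priority in e2e/e2e_test.go:
// the Sev field of the AlertPolicy selects which routing tier the
// heuristic's alerts are sent through.
err := ts.App.BootStrap([]*models.SessionRequestParams{{
    Network:       core.Layer1.String(),
    PType:         core.Live.String(),
    HeuristicType: core.ContractEvent.String(),
    AlertingParams: &core.AlertPolicy{
        Msg: "System config gas config updated",
        Sev: core.HIGH.String(), // routed to every destination configured under "high"
    },
    SessionParams: map[string]interface{}{
        "address": predeploys.DevSystemConfigAddr.String(),
        "args":    []interface{}{"ConfigUpdate(uint256,uint8,bytes)"},
    },
}})
```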

Each severity level is configured independently for each alert destination. A user can add any number of alert
configurations per severity.

In the root of the repository you'll find a file named `alerts-template.yaml`. This file is a template for
configuring alert routing and contains a few examples of how you might want to configure it.
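
The routing file is plain YAML, so it can be inspected or validated with standard tooling. The sketch below is a
hypothetical loader (not Pessimism's actual implementation) that parses a file with the template's shape using
`gopkg.in/yaml.v3` and lists every configured route:

```go
// Hypothetical sketch, not Pessimism's loader: parse an alert-routing.yaml
// with the severity -> destination -> route-name -> settings shape shown in
// alerts-template.yaml and print each configured route.
package main

import (
    "fmt"
    "os"

    "gopkg.in/yaml.v3"
)

type routingFile struct {
    // severity ("low"/"medium"/"high") -> destination ("slack"/"pagerduty")
    // -> route name -> settings (url, channel, integration_key, ...).
    AlertRoutes map[string]map[string]map[string]map[string]string `yaml:"alertRoutes"`
}

func main() {
    raw, err := os.ReadFile("alert-routing.yaml")
    if err != nil {
        panic(err)
    }

    var cfg routingFile
    if err := yaml.Unmarshal(raw, &cfg); err != nil {
        panic(err)
    }

    for sev, destinations := range cfg.AlertRoutes {
        for dest, routes := range destinations {
            for name := range routes {
                fmt.Printf("%s -> %s -> %s\n", sev, dest, name)
            }
        }
    }
}
```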

## Supported Alert Destinations
Pessimism currently supports the following alert destinations:

| Name | Description |
|-----------|-------------------------------------|
| slack | Sends alerts to a Slack channel |
| pagerduty | Sends alerts to a PagerDuty service |

## Alert Severity
Pessimism currently defines the following severities for alerts:

| Severity | Description |
|----------|-----------------------------------------------------------------------------|
| low | Alerts that may not require immediate attention |
| medium | Alerts that could be hazardous, but may not be completely destructive |
| high | Alerts that require immediate attention and could result in a loss of funds |


## PagerDuty Severity Mapping
PagerDuty supports the following severities: `critical`, `error`, `warning`, and `info`.
Pessimism maps its severities to [PagerDuty severities](https://developer.pagerduty.com/docs/ZG9jOjExMDI5NTgx-send-an-alert-event) as follows ([ref](../internal/core/alert.go)):

| Pessimism | PagerDuty |
|-----------|-----------|
| low | warning |
| medium | error |
| high | critical |
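
As a rough illustration of the mapping above (the real logic lives in `internal/core/alert.go`; the `Severity` type
and the `ToPagerDutySev` method name below are assumptions for the sketch, not the repository's code):

```go
// Illustrative sketch of the Pessimism -> PagerDuty severity mapping documented
// above. The severity constants mirror the ones used in the e2e tests; the
// ToPagerDutySev name is hypothetical.
package core

type Severity uint8

const (
    LOW Severity = iota
    MEDIUM
    HIGH
)

// ToPagerDutySev maps a Pessimism severity onto the PagerDuty severity strings
// from the table above: low -> warning, medium -> error, high -> critical.
func (s Severity) ToPagerDutySev() string {
    switch s {
    case LOW:
        return "warning"
    case MEDIUM:
        return "error"
    case HIGH:
        return "critical"
    default:
        return "error"
    }
}
```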
30 changes: 30 additions & 0 deletions e2e/alert-routing-cfg.yaml
@@ -0,0 +1,30 @@
alertRoutes:
  low:
    slack:
      config:
        url: "http://127.0.0.1:7100"
        channel: "#test-low"

  medium:
    slack:
      config:
        url: "http://127.0.0.1:7100"
        channel: "#test-medium"
    pagerduty:
      config:
        integration_key: "test-medium"

  high:
    slack:
      config:
        url: "http://127.0.0.1:7100"
        channel: "#test-high"
      config_2:
        url: "http://127.0.0.1:7100"
        channel: "#test-high-2"

    pagerduty:
      config:
        integration_key: "test-high-1"
      config_2:
        integration_key: "test-high-2"
42 changes: 24 additions & 18 deletions e2e/e2e.go
@@ -3,6 +3,7 @@ package e2e
import (
    "context"
    "errors"
    "fmt"
    "testing"
    "time"

@@ -24,6 +25,11 @@ import (
    "github.com/ethereum/go-ethereum/ethclient"
)

const (
    SlackTestServerPort = 7100
    PagerDutyTestPort   = 7200
)

// SysTestSuite ... Stores all the information needed to run an e2e system test
type SysTestSuite struct {
    t *testing.T
@@ -69,16 +75,18 @@ func CreateL2TestSuite(t *testing.T) *L2TestSuite {

    appCfg := DefaultTestConfig()

    slackServer := NewTestSlackServer()
    appCfg.AlertConfig.SlackConfig.URL = slackServer.Server.URL
    slackServer := NewTestSlackServer("127.0.0.1", SlackTestServerPort)

    pagerdutyServer := NewTestPagerDutyServer("127.0.0.1", PagerDutyTestPort)

    pagerdutyServer := NewTestPagerDutyServer()
    appCfg.AlertConfig.MediumPagerDutyCfg.AlertEventsURL = pagerdutyServer.Server.URL
    appCfg.AlertConfig.RoutingCfgPath = "alert-routing-cfg.yaml"
    appCfg.AlertConfig.PagerdutyAlertEventsURL = fmt.Sprintf("http://127.0.0.1:%d", PagerDutyTestPort)

    pess, kill, err := app.NewPessimismApp(ctx, appCfg)
    if err != nil {
        t.Fatal(err)
    }

    if err := pess.Start(); err != nil {
        t.Fatal(err)
    }
@@ -95,6 +103,8 @@ func CreateL2TestSuite(t *testing.T) *L2TestSuite {
        Close: func() {
            kill()
            node.Close()
            slackServer.Close()
            pagerdutyServer.Close()
        },
        AppCfg: appCfg,
        TestSlackSvr: slackServer,
@@ -126,16 +136,18 @@ func CreateSysTestSuite(t *testing.T) *SysTestSuite {

    appCfg := DefaultTestConfig()

    slackServer := NewTestSlackServer()
    appCfg.AlertConfig.SlackConfig.URL = slackServer.Server.URL
    slackServer := NewTestSlackServer("127.0.0.1", SlackTestServerPort)

    pagerdutyServer := NewTestPagerDutyServer("127.0.0.1", PagerDutyTestPort)

    pagerdutyServer := NewTestPagerDutyServer()
    appCfg.AlertConfig.MediumPagerDutyCfg.AlertEventsURL = pagerdutyServer.Server.URL
    appCfg.AlertConfig.RoutingCfgPath = "alert-routing-cfg.yaml"
    appCfg.AlertConfig.PagerdutyAlertEventsURL = fmt.Sprintf("http://127.0.0.1:%d", PagerDutyTestPort)

    pess, kill, err := app.NewPessimismApp(ctx, appCfg)
    if err != nil {
        t.Fatal(err)
    }

    if err := pess.Start(); err != nil {
        t.Fatal(err)
    }
@@ -152,6 +164,8 @@ func CreateSysTestSuite(t *testing.T) *SysTestSuite {
        Close: func() {
            kill()
            sys.Close()
            slackServer.Close()
            pagerdutyServer.Close()
        },
        AppCfg: appCfg,
        TestSlackSvr: slackServer,
@@ -185,16 +199,8 @@ func DefaultTestConfig() *config.Config {
            Port: port,
        },
        AlertConfig: &alert.Config{
            SlackConfig: &client.SlackConfig{
                URL: "",
                Channel: "test",
            },
            MediumPagerDutyCfg: &client.PagerDutyConfig{
                AlertEventsURL: "",
            },
            HighPagerDutyCfg: &client.PagerDutyConfig{
                AlertEventsURL: "",
            },
            PagerdutyAlertEventsURL: "",
            RoutingCfgPath: "",
        },
    }
}
71 changes: 66 additions & 5 deletions e2e/e2e_test.go
@@ -48,7 +48,7 @@ func Test_Balance_Enforcement(t *testing.T) {
        StartHeight: nil,
        EndHeight: nil,
        AlertingParams: &core.AlertPolicy{
            Sev: core.HIGH.String(),
            Sev: core.MEDIUM.String(),
            Msg: alertMsg,
        },
        SessionParams: map[string]interface{}{
@@ -97,8 +97,8 @@ func Test_Balance_Enforcement(t *testing.T) {
    // Check that the balance enforcement was triggered using the mocked server cache.
    posts := ts.TestPagerDutyServer.PagerDutyAlerts()
    slackPosts := ts.TestSlackSvr.SlackAlerts()
    assert.Greater(t, len(slackPosts), 0, "No balance enforcement alert was sent")
    assert.Greater(t, len(posts), 0, "No balance enforcement alert was sent")
    assert.Greater(t, len(slackPosts), 1, "No balance enforcement alert was sent")
    assert.Greater(t, len(posts), 1, "No balance enforcement alert was sent")
    assert.Contains(t, posts[0].Payload.Summary, "balance_enforcement", "Balance enforcement alert was not sent")

    // Get Bob's balance.
@@ -155,7 +155,7 @@ func Test_Balance_Enforce_With_CoolDown(t *testing.T) {
        AlertingParams: &core.AlertPolicy{
            // Set a cooldown of one minute.
            CoolDown: 60,
            Dest: core.Slack.String(),
            Sev: core.LOW.String(),
            Msg: alertMsg,
        },
        SessionParams: map[string]interface{}{
@@ -202,7 +202,7 @@ func Test_Balance_Enforce_With_CoolDown(t *testing.T) {
    // Check that the balance enforcement was triggered using the mocked server cache.
    posts := ts.TestSlackSvr.SlackAlerts()

    assert.Equal(t, len(posts), 1, "No balance enforcement alert was sent")
    assert.Equal(t, 1, len(posts), "No balance enforcement alert was sent")
    assert.Contains(t, posts[0].Text, "balance_enforcement", "Balance enforcement alert was not sent")
    assert.Contains(t, posts[0].Text, alertMsg)

@@ -275,6 +275,67 @@ func Test_Contract_Event(t *testing.T) {
    assert.Contains(t, posts[0].Text, alertMsg, "System contract event message was not propagated")
}

// Test_Contract_Event_High_Priority ... Tests the E2E flow of a contract event heuristic that routes high severity
// alerts to all configured destinations
func Test_Contract_Event_High_Priority(t *testing.T) {

    ts := e2e.CreateSysTestSuite(t)
    defer ts.Close()

    l1Client := ts.Sys.Clients["l1"]

    updateSig := "ConfigUpdate(uint256,uint8,bytes)"
    alertMsg := "System config gas config updated"

    err := ts.App.BootStrap([]*models.SessionRequestParams{{
        Network: core.Layer1.String(),
        PType: core.Live.String(),
        HeuristicType: core.ContractEvent.String(),
        StartHeight: nil,
        EndHeight: nil,
        AlertingParams: &core.AlertPolicy{
            Msg: alertMsg,
            Sev: core.HIGH.String(),
        },
        SessionParams: map[string]interface{}{
            "address": predeploys.DevSystemConfigAddr.String(),
            "args": []interface{}{updateSig},
        },
    }})

    assert.NoError(t, err, "Error bootstrapping heuristic session")

    sysCfg, err := bindings.NewSystemConfig(predeploys.DevSystemConfigAddr, l1Client)
    assert.NoError(t, err, "Error getting system config")

    opts, err := bind.NewKeyedTransactorWithChainID(ts.Cfg.Secrets.SysCfgOwner, ts.Cfg.L1ChainIDBig())
    assert.NoError(t, err, "Error getting system config owner pk")

    overhead := big.NewInt(10000)
    scalar := big.NewInt(1)

    tx, err := sysCfg.SetGasConfig(opts, overhead, scalar)
    assert.NoError(t, err, "Error setting gas config")

    txTimeoutDuration := 10 * time.Duration(ts.Cfg.DeployConfig.L1BlockTime) * time.Second
    receipt, err := e2e.WaitForTransaction(tx.Hash(), l1Client, txTimeoutDuration)

    assert.NoError(t, err, "Error waiting for transaction")
    assert.Equal(t, receipt.Status, types.ReceiptStatusSuccessful, "transaction failed")

    time.Sleep(3 * time.Second)
    slackPosts := ts.TestSlackSvr.SlackAlerts()
    pdPosts := ts.TestPagerDutyServer.PagerDutyAlerts()

    // Expect 2 alerts to each destination, as alert-routing-cfg.yaml configures two slack and two pagerduty routes for high severity
    assert.Equal(t, 2, len(slackPosts), "Incorrect Number of slack posts sent")
    assert.Equal(t, 2, len(pdPosts), "Incorrect Number of pagerduty posts sent")
    assert.Contains(t, slackPosts[0].Text, "contract_event", "System contract event alert was not sent")
    assert.Contains(t, slackPosts[1].Text, "contract_event", "System contract event alert was not sent")
    assert.Contains(t, pdPosts[0].Payload.Summary, "contract_event", "System contract event alert was not sent")
    assert.Contains(t, pdPosts[1].Payload.Summary, "contract_event", "System contract event alert was not sent")
}

// TestAccount defines an account for testing.
type TestAccount struct {
    Key *ecdsa.PrivateKey
30 changes: 23 additions & 7 deletions e2e/test_pagerduty_server.go
@@ -2,6 +2,8 @@ package e2e

import (
    "encoding/json"
    "fmt"
    "net"
    "net/http"
    "net/http/httptest"
    "strings"
@@ -19,21 +21,35 @@ type TestPagerDutyServer struct {
}

// NewTestPagerDutyServer ... Creates a new mock pagerduty server
func NewTestPagerDutyServer() *TestPagerDutyServer {
    ts := &TestPagerDutyServer{
func NewTestPagerDutyServer(url string, port int) *TestPagerDutyServer { //nolint:dupl //This will be addressed
    l, err := net.Listen("tcp", fmt.Sprintf("%s:%d", url, port))
    if err != nil {
        panic(err)
    }

    pds := &TestPagerDutyServer{
        Payloads: []*client.PagerDutyRequest{},
    }

    ts.Server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
    pds.Server = httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        switch strings.TrimSpace(r.URL.Path) {
        case "/":
            ts.mockPagerDutyPost(w, r)
            pds.mockPagerDutyPost(w, r)
        default:
            http.NotFoundHandler().ServeHTTP(w, r)
        }
    }))

    return ts
    err = pds.Server.Listener.Close()
    if err != nil {
        panic(err)
    }
    pds.Server.Listener = l
    pds.Server.Start()

    logging.NoContext().Info("Test pagerduty server started", zap.String("url", url), zap.Int("port", port))

    return pds
}

// Close ... Closes the server
@@ -47,14 +63,14 @@ func (svr *TestPagerDutyServer) mockPagerDutyPost(w http.ResponseWriter, r *http

    if err := json.NewDecoder(r.Body).Decode(&alert); err != nil {
        w.WriteHeader(http.StatusBadRequest)
        _, _ = w.Write([]byte(`{"status":false, "message":"could not decode pagerduty payload"}`))
        _, _ = w.Write([]byte(`{"status":"failure", "message":"could not decode pagerduty payload"}`))
        return
    }

    svr.Payloads = append(svr.Payloads, alert)

    w.WriteHeader(http.StatusOK)
    _, _ = w.Write([]byte(`{"status":success, "message":""}`))
    _, _ = w.Write([]byte(`{"status":"success", "message":""}`))
}

// PagerDutyAlerts ... Returns the pagerduty alerts