From 413b963f87ca8f922fed19e4206dab43427731b5 Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Mon, 28 Aug 2023 10:45:14 +1000 Subject: [PATCH 01/12] Better handle failed prometheus server --- internal/checks/base.go | 8 +++++++- internal/checks/promql_counter.go | 7 ++++--- internal/checks/promql_counter_test.go | 27 ++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/internal/checks/base.go b/internal/checks/base.go index f9b6b195..42b6a131 100644 --- a/internal/checks/base.go +++ b/internal/checks/base.go @@ -130,7 +130,7 @@ type exprProblem struct { severity Severity } -func textAndSeverityFromError(err error, reporter, prom string, s Severity) (text string, severity Severity) { +func getPromDescAndPerr(err error, prom string) (string, *promapi.FailoverGroupError) { promDesc := fmt.Sprintf("%q", prom) var perr *promapi.FailoverGroupError perrOk := errors.As(err, &perr) @@ -139,6 +139,12 @@ func textAndSeverityFromError(err error, reporter, prom string, s Severity) (tex promDesc = promText(prom, uri) } } + return promDesc, perr +} + +func textAndSeverityFromError(err error, reporter, prom string, s Severity) (text string, severity Severity) { + promDesc, perr := getPromDescAndPerr(err, prom) + perrOk := perr != nil switch { case promapi.IsQueryTooExpensive(err): diff --git a/internal/checks/promql_counter.go b/internal/checks/promql_counter.go index eb52f991..c3ddedd2 100644 --- a/internal/checks/promql_counter.go +++ b/internal/checks/promql_counter.go @@ -82,12 +82,13 @@ func (c CounterCheck) checkNode(ctx context.Context, node *parser.PromQLNode, en } } else { metadata, err := c.prom.Metadata(ctx, s.Name) - if err != nil { - text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) + promDesc, _ := getPromDescAndPerr(err, c.prom.Name()) + text := fmt.Sprintf("couldn't run %q checks due to missing metrics metadata. %s connection error: %s", c.Reporter(), promDesc, err) + if metadata == nil { problems = append(problems, exprProblem{ expr: s.Name, text: text, - severity: severity, + severity: Warning, }) return problems } diff --git a/internal/checks/promql_counter_test.go b/internal/checks/promql_counter_test.go index 67006ace..d66d221f 100644 --- a/internal/checks/promql_counter_test.go +++ b/internal/checks/promql_counter_test.go @@ -202,7 +202,34 @@ func TestCounterCheck(t *testing.T) { }, }, }, + { + description: "500 error from Prometheus API", + content: "- record: foo\n expr: rate(foo[5m])\n", + checker: newCounterCheck, + prometheus: newSimpleProm, + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: "foo", + Lines: []int{2}, + Reporter: "promql/counter", + Text: checkCounterErrorUnableToRun(checks.CounterCheckName, "prom", uri, "server_error: internal error"), + Severity: checks.Warning, + }, + } + }, + mocks: []*prometheusMock{ + { + conds: []requestCondition{requireMetadataPath}, + resp: respondWithInternalError(), + }, + }, + }, } runTests(t, testCases) } + +func checkCounterErrorUnableToRun(c, name, uri, err string) string { + return fmt.Sprintf(`couldn't run %q checks due to missing metrics metadata. prometheus %q at %s connection error: %s`, c, name, uri, err) +} From c5403aae9dc32b78de18a25d9d7f47833e178583 Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Mon, 28 Aug 2023 12:36:01 +1000 Subject: [PATCH 02/12] Revert unnecessary changes --- internal/checks/base.go | 8 +------- internal/checks/promql_counter.go | 6 +++--- internal/checks/promql_counter_test.go | 8 ++------ internal/promapi/failover.go | 4 ++-- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/internal/checks/base.go b/internal/checks/base.go index 42b6a131..f9b6b195 100644 --- a/internal/checks/base.go +++ b/internal/checks/base.go @@ -130,7 +130,7 @@ type exprProblem struct { severity Severity } -func getPromDescAndPerr(err error, prom string) (string, *promapi.FailoverGroupError) { +func textAndSeverityFromError(err error, reporter, prom string, s Severity) (text string, severity Severity) { promDesc := fmt.Sprintf("%q", prom) var perr *promapi.FailoverGroupError perrOk := errors.As(err, &perr) @@ -139,12 +139,6 @@ func getPromDescAndPerr(err error, prom string) (string, *promapi.FailoverGroupE promDesc = promText(prom, uri) } } - return promDesc, perr -} - -func textAndSeverityFromError(err error, reporter, prom string, s Severity) (text string, severity Severity) { - promDesc, perr := getPromDescAndPerr(err, prom) - perrOk := perr != nil switch { case promapi.IsQueryTooExpensive(err): diff --git a/internal/checks/promql_counter.go b/internal/checks/promql_counter.go index c3ddedd2..6b5180c1 100644 --- a/internal/checks/promql_counter.go +++ b/internal/checks/promql_counter.go @@ -82,13 +82,13 @@ func (c CounterCheck) checkNode(ctx context.Context, node *parser.PromQLNode, en } } else { metadata, err := c.prom.Metadata(ctx, s.Name) - promDesc, _ := getPromDescAndPerr(err, c.prom.Name()) - text := fmt.Sprintf("couldn't run %q checks due to missing metrics metadata. %s connection error: %s", c.Reporter(), promDesc, err) + if metadata == nil { + text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Warning) problems = append(problems, exprProblem{ expr: s.Name, text: text, - severity: Warning, + severity: severity, }) return problems } diff --git a/internal/checks/promql_counter_test.go b/internal/checks/promql_counter_test.go index d66d221f..06c1c30d 100644 --- a/internal/checks/promql_counter_test.go +++ b/internal/checks/promql_counter_test.go @@ -213,8 +213,8 @@ func TestCounterCheck(t *testing.T) { Fragment: "foo", Lines: []int{2}, Reporter: "promql/counter", - Text: checkCounterErrorUnableToRun(checks.CounterCheckName, "prom", uri, "server_error: internal error"), - Severity: checks.Warning, + Text: checkErrorUnableToRun(checks.CounterCheckName, "prom", uri, "server_error: internal error"), + Severity: checks.Bug, }, } }, @@ -229,7 +229,3 @@ func TestCounterCheck(t *testing.T) { runTests(t, testCases) } - -func checkCounterErrorUnableToRun(c, name, uri, err string) string { - return fmt.Sprintf(`couldn't run %q checks due to missing metrics metadata. prometheus %q at %s connection error: %s`, c, name, uri, err) -} diff --git a/internal/promapi/failover.go b/internal/promapi/failover.go index 0050e853..18ceab58 100644 --- a/internal/promapi/failover.go +++ b/internal/promapi/failover.go @@ -176,10 +176,10 @@ func (fg *FailoverGroup) Metadata(ctx context.Context, metric string) (metadata for _, prom := range fg.servers { uri = prom.safeURI metadata, err = prom.Metadata(ctx, metric) - if err == nil { + if err == nil && metadata != nil { return metadata, nil } - if !IsUnavailableError(err) { + if metadata != nil { return metadata, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} } } From c6bc7791d588c2e83f48e1277fb6da599e21a962 Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Mon, 28 Aug 2023 15:12:52 +1000 Subject: [PATCH 03/12] Add more tests --- internal/checks/base_test.go | 19 +++++++ internal/checks/promql_counter_test.go | 79 ++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/internal/checks/base_test.go b/internal/checks/base_test.go index 46e1c23e..897c6577 100644 --- a/internal/checks/base_test.go +++ b/internal/checks/base_test.go @@ -76,6 +76,25 @@ func simpleProm(name, uri string, timeout time.Duration, required bool) *promapi ) } +func doubleProm(name, uri string, timeout time.Duration, required bool) *promapi.FailoverGroup { + return promapi.NewFailoverGroup( + name, + []*promapi.Prometheus{ + promapi.NewPrometheus(name, uri, map[string]string{"X-Debug": "1"}, timeout, 16, 1000, nil), + promapi.NewPrometheus(name+"2", uri, map[string]string{"X-Debug": "1"}, timeout, 16, 1000, nil), + }, + required, + "up", + nil, + nil, + nil, + ) +} + +func newDoubleProm(uri string) *promapi.FailoverGroup { + return doubleProm("prom", uri, time.Second*5, true) +} + func newSimpleProm(uri string) *promapi.FailoverGroup { return simpleProm("prom", uri, time.Second*5, true) } diff --git a/internal/checks/promql_counter_test.go b/internal/checks/promql_counter_test.go index 06c1c30d..879048e3 100644 --- a/internal/checks/promql_counter_test.go +++ b/internal/checks/promql_counter_test.go @@ -11,6 +11,30 @@ import ( "github.com/cloudflare/pint/internal/promapi" ) +var ( + respondWithInternalErrorThenGoodData func(writer responseWriter) responseWriter + respondWithEmptyDataThenGoodData func(writer responseWriter) responseWriter +) + +func init() { + respondWithInternalErrorThenGoodDataState := 0 + respondWithInternalErrorThenGoodData = func(goodResponse responseWriter) responseWriter { + respondWithInternalErrorThenGoodDataState++ + if respondWithInternalErrorThenGoodDataState == 0 { + return promError{code: 500, errorType: v1.ErrServer, err: "internal error"} + } + return goodResponse + } + respondWithEmptyDataThenGoodDataState := 0 + respondWithEmptyDataThenGoodData = func(goodResponse responseWriter) responseWriter { + respondWithEmptyDataThenGoodDataState++ + if respondWithEmptyDataThenGoodDataState == 0 { + return metadataResponse{metadata: map[string][]v1.Metadata{}} + } + return goodResponse + } +} + func newCounterCheck(prom *promapi.FailoverGroup) checks.RuleChecker { return checks.NewCounterCheck(prom) } @@ -202,6 +226,19 @@ func TestCounterCheck(t *testing.T) { }, }, }, + { + description: "empty data from Prometheus API", + content: "- alert: my alert\n expr: irate(foo[5m])\n", + checker: newCounterCheck, + prometheus: newSimpleProm, + problems: noProblems, + mocks: []*prometheusMock{ + { + conds: []requestCondition{requireMetadataPath}, + resp: metadataResponse{metadata: map[string][]v1.Metadata{}}, + }, + }, + }, { description: "500 error from Prometheus API", content: "- record: foo\n expr: rate(foo[5m])\n", @@ -225,6 +262,48 @@ func TestCounterCheck(t *testing.T) { }, }, }, + { + description: "500 error from first Prometheus API - use counter with rate", + content: "- record: foo\n expr: rate(foo[5m])\n", + checker: newCounterCheck, + prometheus: newDoubleProm, + problems: noProblems, + + mocks: []*prometheusMock{ + { + conds: []requestCondition{requireMetadataPath}, + resp: respondWithInternalErrorThenGoodData(metadataResponse{metadata: map[string][]v1.Metadata{ + "foo": {{Type: "counter"}}, + }}), + }, + }, + }, + { + description: "empty data from first Prometheus API - use counter with delta", + content: "- record: foo\n expr: delta(foo[5m])\n", + checker: newCounterCheck, + prometheus: newDoubleProm, + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: "foo", + Lines: []int{2}, + Reporter: "promql/counter", + Text: CounterMustUseFuncTextForRecordingRule("foo"), + Severity: checks.Warning, + }, + } + }, + + mocks: []*prometheusMock{ + { + conds: []requestCondition{requireMetadataPath}, + resp: respondWithEmptyDataThenGoodData(metadataResponse{metadata: map[string][]v1.Metadata{ + "foo": {{Type: "counter"}}, + }}), + }, + }, + }, } runTests(t, testCases) From 506d4bd4e6b745758d4efef2e1bdbc0743747592 Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Mon, 28 Aug 2023 17:47:01 +1000 Subject: [PATCH 04/12] Fix our use case --- internal/checks/promql_counter_test.go | 56 +------------------------- internal/promapi/failover.go | 18 ++++++--- 2 files changed, 14 insertions(+), 60 deletions(-) diff --git a/internal/checks/promql_counter_test.go b/internal/checks/promql_counter_test.go index 879048e3..1e25cf8e 100644 --- a/internal/checks/promql_counter_test.go +++ b/internal/checks/promql_counter_test.go @@ -11,30 +11,6 @@ import ( "github.com/cloudflare/pint/internal/promapi" ) -var ( - respondWithInternalErrorThenGoodData func(writer responseWriter) responseWriter - respondWithEmptyDataThenGoodData func(writer responseWriter) responseWriter -) - -func init() { - respondWithInternalErrorThenGoodDataState := 0 - respondWithInternalErrorThenGoodData = func(goodResponse responseWriter) responseWriter { - respondWithInternalErrorThenGoodDataState++ - if respondWithInternalErrorThenGoodDataState == 0 { - return promError{code: 500, errorType: v1.ErrServer, err: "internal error"} - } - return goodResponse - } - respondWithEmptyDataThenGoodDataState := 0 - respondWithEmptyDataThenGoodData = func(goodResponse responseWriter) responseWriter { - respondWithEmptyDataThenGoodDataState++ - if respondWithEmptyDataThenGoodDataState == 0 { - return metadataResponse{metadata: map[string][]v1.Metadata{}} - } - return goodResponse - } -} - func newCounterCheck(prom *promapi.FailoverGroup) checks.RuleChecker { return checks.NewCounterCheck(prom) } @@ -262,45 +238,17 @@ func TestCounterCheck(t *testing.T) { }, }, }, - { - description: "500 error from first Prometheus API - use counter with rate", - content: "- record: foo\n expr: rate(foo[5m])\n", - checker: newCounterCheck, - prometheus: newDoubleProm, - problems: noProblems, - mocks: []*prometheusMock{ - { - conds: []requestCondition{requireMetadataPath}, - resp: respondWithInternalErrorThenGoodData(metadataResponse{metadata: map[string][]v1.Metadata{ - "foo": {{Type: "counter"}}, - }}), - }, - }, - }, { description: "empty data from first Prometheus API - use counter with delta", content: "- record: foo\n expr: delta(foo[5m])\n", checker: newCounterCheck, prometheus: newDoubleProm, - problems: func(uri string) []checks.Problem { - return []checks.Problem{ - { - Fragment: "foo", - Lines: []int{2}, - Reporter: "promql/counter", - Text: CounterMustUseFuncTextForRecordingRule("foo"), - Severity: checks.Warning, - }, - } - }, - + problems: noProblems, mocks: []*prometheusMock{ { conds: []requestCondition{requireMetadataPath}, - resp: respondWithEmptyDataThenGoodData(metadataResponse{metadata: map[string][]v1.Metadata{ - "foo": {{Type: "counter"}}, - }}), + resp: metadataResponse{metadata: map[string][]v1.Metadata{}}, }, }, }, diff --git a/internal/promapi/failover.go b/internal/promapi/failover.go index 18ceab58..2cc117ba 100644 --- a/internal/promapi/failover.go +++ b/internal/promapi/failover.go @@ -176,14 +176,20 @@ func (fg *FailoverGroup) Metadata(ctx context.Context, metric string) (metadata for _, prom := range fg.servers { uri = prom.safeURI metadata, err = prom.Metadata(ctx, metric) - if err == nil && metadata != nil { - return metadata, nil - } - if metadata != nil { - return metadata, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} + if metadata != nil && metadata.Metadata != nil { + if err == nil { + return metadata, nil + } else { + return metadata, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} + } } } - return nil, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} + if err != nil { + return nil, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} + } else { + return metadata, nil + } + } func (fg *FailoverGroup) Flags(ctx context.Context) (flags *FlagsResult, err error) { From b45fb3aff9fcd8e211df4f18f7a3c24427a84dfb Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Tue, 29 Aug 2023 16:21:51 +1000 Subject: [PATCH 05/12] Clean up --- internal/checks/base_test.go | 19 ------------------- internal/checks/promql_counter_test.go | 2 +- internal/promapi/failover.go | 3 ++- 3 files changed, 3 insertions(+), 21 deletions(-) diff --git a/internal/checks/base_test.go b/internal/checks/base_test.go index 897c6577..46e1c23e 100644 --- a/internal/checks/base_test.go +++ b/internal/checks/base_test.go @@ -76,25 +76,6 @@ func simpleProm(name, uri string, timeout time.Duration, required bool) *promapi ) } -func doubleProm(name, uri string, timeout time.Duration, required bool) *promapi.FailoverGroup { - return promapi.NewFailoverGroup( - name, - []*promapi.Prometheus{ - promapi.NewPrometheus(name, uri, map[string]string{"X-Debug": "1"}, timeout, 16, 1000, nil), - promapi.NewPrometheus(name+"2", uri, map[string]string{"X-Debug": "1"}, timeout, 16, 1000, nil), - }, - required, - "up", - nil, - nil, - nil, - ) -} - -func newDoubleProm(uri string) *promapi.FailoverGroup { - return doubleProm("prom", uri, time.Second*5, true) -} - func newSimpleProm(uri string) *promapi.FailoverGroup { return simpleProm("prom", uri, time.Second*5, true) } diff --git a/internal/checks/promql_counter_test.go b/internal/checks/promql_counter_test.go index 1e25cf8e..b0de3439 100644 --- a/internal/checks/promql_counter_test.go +++ b/internal/checks/promql_counter_test.go @@ -243,7 +243,7 @@ func TestCounterCheck(t *testing.T) { description: "empty data from first Prometheus API - use counter with delta", content: "- record: foo\n expr: delta(foo[5m])\n", checker: newCounterCheck, - prometheus: newDoubleProm, + prometheus: newSimpleProm, problems: noProblems, mocks: []*prometheusMock{ { diff --git a/internal/promapi/failover.go b/internal/promapi/failover.go index 2cc117ba..4489f7a3 100644 --- a/internal/promapi/failover.go +++ b/internal/promapi/failover.go @@ -171,6 +171,8 @@ func (fg *FailoverGroup) RangeQuery(ctx context.Context, expr string, params Ran return nil, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} } +// This function treats FailoverGroup differently to pint's original intention. Rather than assuming every server in the group is identical, we have a collection of servers each with different metrics. +// A single server failure is not catastrophic, and we care more about retrieving the metric metadata then reporting a single server error. func (fg *FailoverGroup) Metadata(ctx context.Context, metric string) (metadata *MetadataResult, err error) { var uri string for _, prom := range fg.servers { @@ -189,7 +191,6 @@ func (fg *FailoverGroup) Metadata(ctx context.Context, metric string) (metadata } else { return metadata, nil } - } func (fg *FailoverGroup) Flags(ctx context.Context) (flags *FlagsResult, err error) { From a8e2aa368e1ae19409f382a9f52038f117efc684 Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Tue, 29 Aug 2023 16:26:26 +1000 Subject: [PATCH 06/12] cleanup --- internal/promapi/failover.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/internal/promapi/failover.go b/internal/promapi/failover.go index 4489f7a3..6ffc5539 100644 --- a/internal/promapi/failover.go +++ b/internal/promapi/failover.go @@ -188,9 +188,8 @@ func (fg *FailoverGroup) Metadata(ctx context.Context, metric string) (metadata } if err != nil { return nil, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} - } else { - return metadata, nil } + return metadata, nil } func (fg *FailoverGroup) Flags(ctx context.Context) (flags *FlagsResult, err error) { From 8a9c2840b40ad0f9378742c56a29c54fe05efff9 Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Wed, 30 Aug 2023 18:27:39 +1000 Subject: [PATCH 07/12] Fix logic --- internal/checks/promql_counter.go | 13 ++++--------- internal/checks/promql_counter_test.go | 14 ++------------ internal/promapi/failover.go | 18 ++++++------------ 3 files changed, 12 insertions(+), 33 deletions(-) diff --git a/internal/checks/promql_counter.go b/internal/checks/promql_counter.go index 6b5180c1..7cdfd8fb 100644 --- a/internal/checks/promql_counter.go +++ b/internal/checks/promql_counter.go @@ -81,15 +81,10 @@ func (c CounterCheck) checkNode(ctx context.Context, node *parser.PromQLNode, en return problems } } else { - metadata, err := c.prom.Metadata(ctx, s.Name) - - if metadata == nil { - text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Warning) - problems = append(problems, exprProblem{ - expr: s.Name, - text: text, - severity: severity, - }) + metadata, _ := c.prom.Metadata(ctx, s.Name) + + // We ignore errors or missing metadata from prometheus servers. + if metadata == nil || metadata.Metadata == nil { return problems } diff --git a/internal/checks/promql_counter_test.go b/internal/checks/promql_counter_test.go index b0de3439..980ad496 100644 --- a/internal/checks/promql_counter_test.go +++ b/internal/checks/promql_counter_test.go @@ -220,17 +220,7 @@ func TestCounterCheck(t *testing.T) { content: "- record: foo\n expr: rate(foo[5m])\n", checker: newCounterCheck, prometheus: newSimpleProm, - problems: func(uri string) []checks.Problem { - return []checks.Problem{ - { - Fragment: "foo", - Lines: []int{2}, - Reporter: "promql/counter", - Text: checkErrorUnableToRun(checks.CounterCheckName, "prom", uri, "server_error: internal error"), - Severity: checks.Bug, - }, - } - }, + problems: noProblems, mocks: []*prometheusMock{ { conds: []requestCondition{requireMetadataPath}, @@ -240,7 +230,7 @@ func TestCounterCheck(t *testing.T) { }, { - description: "empty data from first Prometheus API - use counter with delta", + description: "empty data from Prometheus API", content: "- record: foo\n expr: delta(foo[5m])\n", checker: newCounterCheck, prometheus: newSimpleProm, diff --git a/internal/promapi/failover.go b/internal/promapi/failover.go index 6ffc5539..0050e853 100644 --- a/internal/promapi/failover.go +++ b/internal/promapi/failover.go @@ -171,25 +171,19 @@ func (fg *FailoverGroup) RangeQuery(ctx context.Context, expr string, params Ran return nil, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} } -// This function treats FailoverGroup differently to pint's original intention. Rather than assuming every server in the group is identical, we have a collection of servers each with different metrics. -// A single server failure is not catastrophic, and we care more about retrieving the metric metadata then reporting a single server error. func (fg *FailoverGroup) Metadata(ctx context.Context, metric string) (metadata *MetadataResult, err error) { var uri string for _, prom := range fg.servers { uri = prom.safeURI metadata, err = prom.Metadata(ctx, metric) - if metadata != nil && metadata.Metadata != nil { - if err == nil { - return metadata, nil - } else { - return metadata, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} - } + if err == nil { + return metadata, nil + } + if !IsUnavailableError(err) { + return metadata, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} } } - if err != nil { - return nil, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} - } - return metadata, nil + return nil, &FailoverGroupError{err: err, uri: uri, isStrict: fg.strictErrors} } func (fg *FailoverGroup) Flags(ctx context.Context) (flags *FlagsResult, err error) { From 5a56056f31b0f776690e285055198392e1a07ff4 Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Thu, 31 Aug 2023 10:27:46 +1000 Subject: [PATCH 08/12] Update snapshots --- cmd/pint/tests/0054_watch_metrics_prometheus.txt | 4 ---- cmd/pint/tests/0104_file_ignore_prom.txt | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/cmd/pint/tests/0054_watch_metrics_prometheus.txt b/cmd/pint/tests/0054_watch_metrics_prometheus.txt index d5132ec9..8661540a 100644 --- a/cmd/pint/tests/0054_watch_metrics_prometheus.txt +++ b/cmd/pint/tests/0054_watch_metrics_prometheus.txt @@ -86,16 +86,12 @@ pint_last_run_duration_seconds pint_last_run_time_seconds # HELP pint_problem Prometheus rule problem reported by pint # TYPE pint_problem gauge -pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/counter\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: error",reporter="promql/counter",severity="bug"} -pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/counter\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/counter",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/range_query\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/range_query",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/rate\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: server error: 500",reporter="promql/rate",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/rate\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/rate",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/series\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/series",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="prometheus \"prom1\" at http://127.0.0.1:7054 failed with: bad_data: bogus query",reporter="promql/series",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="broken",owner="",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"} -pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/counter\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: error",reporter="promql/counter",severity="bug"} -pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/counter\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/counter",severity="bug"} pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/range_query\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/range_query",severity="bug"} pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/rate\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: server error: 500",reporter="promql/rate",severity="bug"} pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/rate\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/rate",severity="bug"} diff --git a/cmd/pint/tests/0104_file_ignore_prom.txt b/cmd/pint/tests/0104_file_ignore_prom.txt index d775f1b9..4f315c31 100644 --- a/cmd/pint/tests/0104_file_ignore_prom.txt +++ b/cmd/pint/tests/0104_file_ignore_prom.txt @@ -7,7 +7,7 @@ stderr 'level=error msg="Query returned an error" error="server error: 502" quer stderr 'level=error msg="Query returned an error" error="server error: 502" query=foo uri=http://127.0.0.1:7104' stderr 'level=error msg="Query returned an error" error="server error: 502" query=/api/v1/status/flags uri=http://127.0.0.1:7104' stderr 'level=error msg="Query returned an error" error="server error: 502" query=count\(foo\) uri=http://127.0.0.1:7104' -stderr 'level=info msg="Problems found" Bug=4' +stderr 'level=info msg="Problems found" Bug=3' -- rules/0001.yml -- # This should skip all online checks # pint file/disable promql/series From 0c074c86bfc0a38f03f36ebba451e86aa01afcb1 Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Thu, 31 Aug 2023 10:58:18 +1000 Subject: [PATCH 09/12] Remove dupelicate test --- internal/checks/promql_counter_test.go | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/internal/checks/promql_counter_test.go b/internal/checks/promql_counter_test.go index 980ad496..453072d9 100644 --- a/internal/checks/promql_counter_test.go +++ b/internal/checks/promql_counter_test.go @@ -228,20 +228,6 @@ func TestCounterCheck(t *testing.T) { }, }, }, - - { - description: "empty data from Prometheus API", - content: "- record: foo\n expr: delta(foo[5m])\n", - checker: newCounterCheck, - prometheus: newSimpleProm, - problems: noProblems, - mocks: []*prometheusMock{ - { - conds: []requestCondition{requireMetadataPath}, - resp: metadataResponse{metadata: map[string][]v1.Metadata{}}, - }, - }, - }, } runTests(t, testCases) From 40e8e792f2083496ac249d47df197eb5961a35f8 Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Thu, 31 Aug 2023 13:59:43 +1000 Subject: [PATCH 10/12] Surface server failures --- cmd/pint/tests/0054_watch_metrics_prometheus.txt | 4 ++++ cmd/pint/tests/0104_file_ignore_prom.txt | 2 +- internal/checks/promql_counter.go | 13 +++++++++---- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/cmd/pint/tests/0054_watch_metrics_prometheus.txt b/cmd/pint/tests/0054_watch_metrics_prometheus.txt index 8661540a..d5132ec9 100644 --- a/cmd/pint/tests/0054_watch_metrics_prometheus.txt +++ b/cmd/pint/tests/0054_watch_metrics_prometheus.txt @@ -86,12 +86,16 @@ pint_last_run_duration_seconds pint_last_run_time_seconds # HELP pint_problem Prometheus rule problem reported by pint # TYPE pint_problem gauge +pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/counter\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: error",reporter="promql/counter",severity="bug"} +pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/counter\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/counter",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/range_query\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/range_query",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/rate\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: server error: 500",reporter="promql/rate",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/rate\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/rate",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="couldn't run \"promql/series\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/series",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="prometheus \"prom1\" at http://127.0.0.1:7054 failed with: bad_data: bogus query",reporter="promql/series",severity="bug"} pint_problem{filename="rules/1.yml",kind="recording",name="broken",owner="",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"} +pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/counter\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: error",reporter="promql/counter",severity="bug"} +pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/counter\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/counter",severity="bug"} pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/range_query\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/range_query",severity="bug"} pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/rate\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: server error: 500",reporter="promql/rate",severity="bug"} pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="couldn't run \"promql/rate\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/rate",severity="bug"} diff --git a/cmd/pint/tests/0104_file_ignore_prom.txt b/cmd/pint/tests/0104_file_ignore_prom.txt index 4f315c31..d775f1b9 100644 --- a/cmd/pint/tests/0104_file_ignore_prom.txt +++ b/cmd/pint/tests/0104_file_ignore_prom.txt @@ -7,7 +7,7 @@ stderr 'level=error msg="Query returned an error" error="server error: 502" quer stderr 'level=error msg="Query returned an error" error="server error: 502" query=foo uri=http://127.0.0.1:7104' stderr 'level=error msg="Query returned an error" error="server error: 502" query=/api/v1/status/flags uri=http://127.0.0.1:7104' stderr 'level=error msg="Query returned an error" error="server error: 502" query=count\(foo\) uri=http://127.0.0.1:7104' -stderr 'level=info msg="Problems found" Bug=3' +stderr 'level=info msg="Problems found" Bug=4' -- rules/0001.yml -- # This should skip all online checks # pint file/disable promql/series diff --git a/internal/checks/promql_counter.go b/internal/checks/promql_counter.go index 7cdfd8fb..c43620c2 100644 --- a/internal/checks/promql_counter.go +++ b/internal/checks/promql_counter.go @@ -81,10 +81,15 @@ func (c CounterCheck) checkNode(ctx context.Context, node *parser.PromQLNode, en return problems } } else { - metadata, _ := c.prom.Metadata(ctx, s.Name) - - // We ignore errors or missing metadata from prometheus servers. - if metadata == nil || metadata.Metadata == nil { + metadata, err := c.prom.Metadata(ctx, s.Name) + + if err != nil { + text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) + problems = append(problems, exprProblem{ + expr: s.Name, + text: text, + severity: severity, + }) return problems } From 4fac07367060bd23b79871cabf2e79c59674587e Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Thu, 31 Aug 2023 14:03:38 +1000 Subject: [PATCH 11/12] Fix test --- internal/checks/promql_counter_test.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/internal/checks/promql_counter_test.go b/internal/checks/promql_counter_test.go index 453072d9..5482b321 100644 --- a/internal/checks/promql_counter_test.go +++ b/internal/checks/promql_counter_test.go @@ -220,7 +220,17 @@ func TestCounterCheck(t *testing.T) { content: "- record: foo\n expr: rate(foo[5m])\n", checker: newCounterCheck, prometheus: newSimpleProm, - problems: noProblems, + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: "foo", + Lines: []int{2}, + Reporter: "promql/counter", + Text: checkErrorUnableToRun(checks.CounterCheckName, "prom", uri, "server_error: internal error"), + Severity: checks.Bug, + }, + } + }, mocks: []*prometheusMock{ { conds: []requestCondition{requireMetadataPath}, From df7c3ac14a149830c027ad41e3ba79dd0c616d80 Mon Sep 17 00:00:00 2001 From: "cary.jin" Date: Thu, 31 Aug 2023 14:06:48 +1000 Subject: [PATCH 12/12] Update promql_counter.go --- internal/checks/promql_counter.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/checks/promql_counter.go b/internal/checks/promql_counter.go index c43620c2..eb52f991 100644 --- a/internal/checks/promql_counter.go +++ b/internal/checks/promql_counter.go @@ -82,7 +82,6 @@ func (c CounterCheck) checkNode(ctx context.Context, node *parser.PromQLNode, en } } else { metadata, err := c.prom.Metadata(ctx, s.Name) - if err != nil { text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) problems = append(problems, exprProblem{