Skip to content

Commit

Permalink
Check other Prometheus servers for missing metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
prymitive committed Oct 31, 2023
1 parent 4dbe0ab commit f6979f7
Show file tree
Hide file tree
Showing 17 changed files with 145 additions and 6 deletions.
2 changes: 2 additions & 0 deletions cmd/pint/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/cloudflare/pint/internal/config"
"github.com/cloudflare/pint/internal/discovery"
"github.com/cloudflare/pint/internal/output"
"github.com/cloudflare/pint/internal/promapi"
"github.com/cloudflare/pint/internal/reporter"
)

Expand Down Expand Up @@ -76,6 +77,7 @@ func checkRules(ctx context.Context, workers int, gen *config.PrometheusGenerato
results := make(chan reporter.Report, workers*5)
wg := sync.WaitGroup{}

ctx = context.WithValue(ctx, promapi.AllPrometheusServers, gen.Servers())
for _, s := range cfg.Check {
settings, _ := s.Decode()
key := checks.SettingsKey(s.Name)
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0037_disable_checks.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=DEBUG msg="File parsed" path=rules/0001.yml rules=3
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1 workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found alerting rule" path=rules/0001.yml alert=default-for lines=1-3
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1 workers=16
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/template","promql/fragile","promql/regexp","promql/vector_matching(prom)","rule/duplicate(prom)","labels/conflict(prom)"] path=rules/0001.yml rule=default-for
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=sum-job lines=5-6
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/template","promql/fragile","promql/regexp","promql/vector_matching(prom)","rule/duplicate(prom)","labels/conflict(prom)","promql/aggregate(job:true)"] path=rules/0001.yml rule=sum-job
Expand Down
2 changes: 2 additions & 0 deletions cmd/pint/tests/0039_prom_selected_path.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=DEBUG msg="File parsed" path=rules/0001.yml rules=3
level=INFO msg="Configured new Prometheus server" name=disabled uris=1 tags=[] include=["^invalid/.+$"] exclude=["^invalid/rules/.+$"]
level=DEBUG msg="Starting query workers" name=disabled uri=http://127.0.0.1:123 workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found alerting rule" path=rules/0001.yml alert=first lines=1-3
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp"] path=rules/0001.yml rule=first
Expand All @@ -18,6 +19,7 @@ rules/0001.yml:6 Warning: job label is required and should be preserved when agg
6 | expr: sum(bar)

level=INFO msg="Problems found" Warning=1
level=DEBUG msg="Stopping query workers" name=disabled uri=http://127.0.0.1:123
-- rules/0001.yml --
- alert: first
expr: foo > 1
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0103_file_disable.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=DEBUG msg="File parsed" path=rules/0001.yml rules=1
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines=9-10
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/vector_matching(prom)","labels/conflict(prom)","alerts/external_labels(prom)"] path=rules/0001.yml rule=colo:test1
level=DEBUG msg="Stopping query workers" name=prom uri=http://127.0.0.1:7103
-- rules/0001.yml --
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0115_file_disable_tag.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=DEBUG msg="File parsed" path=rules/0001.yml rules=1
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=["foo","bar"] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines="6 8"
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","alerts/external_labels(prom)"] path=rules/0001.yml rule=colo:test1
level=DEBUG msg="Stopping query workers" name=prom uri=http://127.0.0.1:7103
-- rules/0001.yml --
Expand Down
8 changes: 8 additions & 0 deletions cmd/pint/tests/0144_discovery_filepath.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,18 @@ level=DEBUG msg="Path discovery match" match=^(?P<name>\w+).ya?ml$ path=prom2.ym
level=DEBUG msg="Extracted regexp variables" regexp=^(?P<name>\w+).ya?ml$ vars={"name":"prom2"}
level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.example.com headers=["X-Host"] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=["name/prom2"] required=true
level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 tags=["name/prom1"] include=[] exclude=["^.*$"]
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1.example.com workers=16
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1-backup.example.com workers=16
level=INFO msg="Configured new Prometheus server" name=prom2 uris=2 tags=["name/prom2"] include=[] exclude=["^.*$"]
level=DEBUG msg="Starting query workers" name=prom2 uri=https://prom2.example.com workers=16
level=DEBUG msg="Starting query workers" name=prom2 uri=https://prom2-backup.example.com workers=16
level=DEBUG msg="Generated all Prometheus servers" count=2
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=sum:up lines=4-5
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp"] path=rules/0001.yml rule=sum:up
level=DEBUG msg="Stopping query workers" name=prom1 uri=https://prom1.example.com
level=DEBUG msg="Stopping query workers" name=prom1 uri=https://prom1-backup.example.com
level=DEBUG msg="Stopping query workers" name=prom2 uri=https://prom2.example.com
level=DEBUG msg="Stopping query workers" name=prom2 uri=https://prom2-backup.example.com
-- rules/0001.yml --
groups:
- name: foo
Expand Down
6 changes: 6 additions & 0 deletions cmd/pint/tests/0145_discovery_filepath_dup.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=DEBUG msg="File parsed" path=rules/0001.yml rules=1
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom2 uri=https://unique.example.com workers=16
level=INFO msg="Finding Prometheus servers using file paths" dir=servers match=^(?P<name>\w+).ya?ml$
level=DEBUG msg="Path discovery match" match=^(?P<name>\w+).ya?ml$ path=prom1.yaml
level=DEBUG msg="Extracted regexp variables" regexp=^(?P<name>\w+).ya?ml$ vars={"name":"prom1"}
Expand All @@ -21,6 +22,11 @@ level=DEBUG msg="Path discovery match" match=^(?P<name>\w+).ya?ml$ path=prom2.ym
level=DEBUG msg="Extracted regexp variables" regexp=^(?P<name>\w+).ya?ml$ vars={"name":"prom2"}
level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.example.com headers=[] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=["name/prom2"] required=true
level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 tags=["name/prom1"] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1.example.com workers=16
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1-backup.example.com workers=16
level=DEBUG msg="Stopping query workers" name=prom2 uri=https://unique.example.com
level=DEBUG msg="Stopping query workers" name=prom1 uri=https://prom1.example.com
level=DEBUG msg="Stopping query workers" name=prom1 uri=https://prom1-backup.example.com
level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom2"
-- rules/0001.yml --
groups:
Expand Down
4 changes: 4 additions & 0 deletions cmd/pint/tests/0149_discovery_prom.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,13 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.exam
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7149
level=DEBUG msg="Added new failover URI" name=prom-ha uri=https://prom2.example.com
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 tags=[] include=[] exclude=["^.*$"]
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom2.example.com workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=sum:up lines=4-5
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp"] path=rules/0001.yml rule=sum:up
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom2.example.com
-- rules/0001.yml --
groups:
- name: foo
Expand Down
2 changes: 2 additions & 0 deletions cmd/pint/tests/0150_discovery_prom_dup_tags.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.exam
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7150
level=WARN msg="Duplicated prometheus server with different tags" name=prom-ha a=["prom2"] b=["prom1"]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=["prom1"] include=[] exclude=[]
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha"
-- rules/0001.yml --
groups:
Expand Down
4 changes: 4 additions & 0 deletions cmd/pint/tests/0152_discovery_prom_dup_uptime.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,13 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.exam
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=[] timeout=5s concurrency=16 rateLimit=100 uptime=prom2 tags=[] required=false
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7152
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 tags=[] include=[] exclude=["^.*$"]
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom2.example.com workers=16
level=DEBUG msg="Generated all Prometheus servers" count=1
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=sum:up lines=4-5
level=DEBUG msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp"] path=rules/0001.yml rule=sum:up
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom2.example.com
-- rules/0001.yml --
groups:
- name: foo
Expand Down
2 changes: 2 additions & 0 deletions cmd/pint/tests/0155_discovery_prom_dup_include.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.exam
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7155
level=WARN msg="Duplicated prometheus server with different include" name=prom-ha a=["^prom2$"] b=["^prom1$"]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=[] include=["^prom1$"] exclude=[]
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha"
-- rules/0001.yml --
groups:
Expand Down
2 changes: 2 additions & 0 deletions cmd/pint/tests/0156_discovery_prom_dup_exclude.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.exam
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7156
level=WARN msg="Duplicated prometheus server with different exclude" name=prom-ha a=["^prom2$"] b=["^prom1$"]
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=[] include=[] exclude=["^prom1$"]
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha"
-- rules/0001.yml --
groups:
Expand Down
49 changes: 49 additions & 0 deletions cmd/pint/tests/0157_series_other_servers.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
http response prometheus1 /api/v1/metadata 200 {"status":"success","data":{}}
http response prometheus1 /api/v1/status/config 200 {"status":"success","data":{"yaml":"global:\n scrape_interval: 30s\n"}}
http response prometheus1 /api/v1/status/flags 200 {"status":"success","data":{"storage.tsdb.retention.time": "1d"}}
http response prometheus1 /api/v1/query_range 200 {"status":"success","data":{"resultType":"matrix","result":[]}}
http response prometheus1 /api/v1/query 200 {"status":"success","data":{"resultType":"vector","result":[]}}
http start prometheus1 127.0.0.1:7157

http response prometheus2 /api/v1/metadata 200 {"status":"success","data":{}}
http response prometheus2 /api/v1/status/config 200 {"status":"success","data":{"yaml":"global:\n scrape_interval: 30s\n"}}
http response prometheus2 /api/v1/status/flags 200 {"status":"success","data":{"storage.tsdb.retention.time": "1d"}}
http response prometheus2 /api/v1/query_range 200 {"status":"success","data":{"resultType":"matrix","result":[]}}
http response prometheus2 /api/v1/query 200 {"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1698249632.491,"1"]}]}}
http start prometheus2 127.0.0.1:8157

pint.error --no-color lint rules
! stdout .
cmp stderr stderr.txt

-- stderr.txt --
level=INFO msg="Loading configuration file" path=.pint.hcl
level=INFO msg="Finding all rules to check" paths=["rules"]
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 tags=[] include=["^rules/1.yml$"] exclude=[]
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=["^rules/2.yml$"] exclude=[]
level=WARN msg="No results for Prometheus uptime metric, you might have set uptime config option to a missing metric, please check your config" name=prom1 metric=up
level=WARN msg="Using dummy Prometheus uptime metric results with no gaps" name=prom1 metric=up
rules/1.yml:5 Bug: prometheus "prom1" at http://127.0.0.1:7157 didn't have any series for "only_on_prom2" metric in the last 1w, "only_on_prom2" was found on other prometheus servers: prom2, are you deploying this rule to the correct instance? (promql/series)
5 | expr: only_on_prom2 == 0

level=INFO msg="Problems found" Bug=1
level=ERROR msg="Fatal error" err="found 1 problem(s) with severity Bug or higher"
-- rules/1.yml --
groups:
- name: foo
rules:
- alert: foo
expr: only_on_prom2 == 0
-- .pint.hcl --
prometheus "prom1" {
uri = "http://127.0.0.1:7157"
timeout = "5s"
required = true
include = [ "rules/1.yml" ]
}
prometheus "prom2" {
uri = "http://127.0.0.1:8157"
timeout = "5s"
required = true
include = [ "rules/2.yml" ]
}
8 changes: 8 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@

- Added [alerts/external_labels](checks/alerts/external_labels.md) check.

### Changed

- When [promql/series](checks/promql/series.md) finds that a time series used
by a rule is missing, it will now also check other defined Prometheus servers
and add that information to the report.
This allows pint to flag rules that are most likely deployed to the wrong server,
for example rules that use metrics from scrape jobs missing on that server.

## v0.48.2

### Fixed
Expand Down
44 changes: 42 additions & 2 deletions internal/checks/promql_series.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,8 +262,12 @@ func (c SeriesCheck) Check(ctx context.Context, _ string, rule parser.Rule, entr
text, severity := c.textAndSeverity(
settings,
bareSelector.String(),
fmt.Sprintf("%s didn't have any series for %q metric in the last %s",
promText(c.prom.Name(), trs.URI), bareSelector.String(), sinceDesc(trs.Series.From)),
fmt.Sprintf("%s didn't have any series for %q metric in the last %s%s",
promText(c.prom.Name(), trs.URI),
bareSelector.String(),
sinceDesc(trs.Series.From),
c.checkOtherServer(ctx, selector.String()),
),
Bug,
)
problems = append(problems, Problem{
Expand Down Expand Up @@ -516,6 +520,42 @@ func (c SeriesCheck) Check(ctx context.Context, _ string, rule parser.Rule, entr
return problems
}

// checkOtherServer runs count(<query>) against every Prometheus server stored
// in ctx under the promapi.AllPrometheusServers key and reports which of them
// returned a non-zero count.
//
// The return value is a message suffix meant to be appended to a "missing
// series" problem description. It is an empty string when ctx carries no
// server list (or a value of an unexpected type), when the list is empty, or
// when none of the servers reported the series.
func (c SeriesCheck) checkOtherServer(ctx context.Context, query string) string {
	// Comma-ok assertion: a missing key (nil) or a value of the wrong type both
	// yield ok == false instead of panicking in the middle of a check run.
	servers, ok := ctx.Value(promapi.AllPrometheusServers).([]*promapi.FailoverGroup)
	if !ok || len(servers) == 0 {
		return ""
	}

	// Names of servers where the query matched at least one series.
	var presentProms []string
	for _, prom := range servers {
		slog.Debug("Checking if metric exists on any other Prometheus server", slog.String("check", c.Reporter()), slog.String("selector", query))

		qr, err := prom.Query(ctx, fmt.Sprintf("count(%s)", query))
		if err != nil {
			// Best effort: this lookup only adds a hint to an already reported
			// problem, so a server that cannot be queried is simply skipped.
			continue
		}

		// Add up the values of all returned samples; anything above zero
		// means the server has matching series.
		var series int
		for _, s := range qr.Series {
			series += int(s.Value)
		}

		if series > 0 {
			presentProms = append(presentProms, prom.Name())
		}
	}

	if len(presentProms) > 0 {
		return fmt.Sprintf(", %q was found on other prometheus servers: %s, are you deploying this rule to the correct instance?", query, strings.Join(presentProms, ", "))
	}

	return ""
}

func (c SeriesCheck) queryProblem(err error, selector string, expr parser.PromQLExpr) Problem {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
return Problem{
Expand Down
6 changes: 5 additions & 1 deletion internal/config/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,10 @@ type PrometheusGenerator struct {
cfg Config
}

// Servers returns the Prometheus servers held by the generator.
// The generator's own slice is returned, not a copy.
func (pg *PrometheusGenerator) Servers() []*promapi.FailoverGroup {
	all := pg.servers
	return all
}

// Count reports how many Prometheus servers the generator currently holds.
func (pg *PrometheusGenerator) Count() int {
	total := len(pg.servers)
	return total
}
Expand All @@ -213,7 +217,6 @@ func (pg *PrometheusGenerator) ServersForPath(path string) []*promapi.FailoverGr
var servers []*promapi.FailoverGroup
for _, server := range pg.servers {
if server.IsEnabledForPath(path) {
server.StartWorkers(pg.metricsRegistry)
servers = append(servers, server)
}
}
Expand All @@ -235,6 +238,7 @@ func (pg *PrometheusGenerator) addServer(server *promapi.FailoverGroup) error {
slog.Any("include", server.Include()),
slog.Any("exclude", server.Exclude()),
)
server.StartWorkers(pg.metricsRegistry)
return nil
}

Expand Down
6 changes: 6 additions & 0 deletions internal/promapi/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ import (
"go.uber.org/ratelimit"
)

// PrometheusContextKey is the type of keys used to attach Prometheus related
// values to a context.Context.
type PrometheusContextKey string

// AllPrometheusServers is the context key under which the list of all
// Prometheus servers is stored.
const AllPrometheusServers PrometheusContextKey = "allServers"

type QueryError struct {
err error
msg string
Expand Down

0 comments on commit f6979f7

Please sign in to comment.