From 721fca6145c98478c52b02603ee84abc809cdd2b Mon Sep 17 00:00:00 2001
From: Valery Meleshkin
Date: Wed, 14 Dec 2022 18:19:45 +0100
Subject: [PATCH 1/3] Fixing the query behind the chunks_uncompressed metric
 by making it rely on a function used by the maintenance jobs.

It should also fix #1741
---
 pkg/pgmodel/metrics/database/metrics.go | 30 ++++++++++++-------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/pkg/pgmodel/metrics/database/metrics.go b/pkg/pgmodel/metrics/database/metrics.go
index aea9d053b4..3a52403b49 100644
--- a/pkg/pgmodel/metrics/database/metrics.go
+++ b/pkg/pgmodel/metrics/database/metrics.go
@@ -149,6 +149,12 @@ var metrics = []metricQueryWrap{
 				Name:      "chunks_metrics_uncompressed_count",
 				Help:      "The number of metrics chunks soon to be compressed by maintenance jobs.",
 			},
+		),
+		customPollConfig: updateAtMostEvery(9 * time.Minute),
+		query: `SELECT coalesce(sum(jsonb_array_length(chunks_to_compress)), 0)::BIGINT AS uncompressed
+			FROM _prom_catalog.metric_chunks_that_need_to_be_compressed(INTERVAL '1 hour');`,
+	}, {
+		metrics: gauges(
 			prometheus.GaugeOpts{
 				Namespace: util.PromNamespace,
 				Subsystem: "sql_database",
@@ -157,24 +163,16 @@
 			},
 		),
 		customPollConfig: updateAtMostEvery(9 * time.Minute),
-		query: `WITH chunk_candidates AS MATERIALIZED (
-			SELECT chcons.dimension_slice_id, h.table_name, h.schema_name
-			FROM _timescaledb_catalog.chunk_constraint chcons
-			INNER JOIN _timescaledb_catalog.chunk c ON c.id = chcons.chunk_id
-			INNER JOIN _timescaledb_catalog.hypertable h ON h.id = c.hypertable_id
-			WHERE c.dropped IS FALSE
-			AND h.compression_state = 1 -- compression_enabled = TRUE
-			AND (c.status & 1) != 1 -- only check for uncompressed chunks
-			)
-			SELECT
-			count(*) FILTER(WHERE m.delay_compression_until IS NULL OR m.delay_compression_until < now())::BIGINT AS uncompressed,
-			count(*) FILTER(WHERE m.delay_compression_until IS NOT NULL AND m.delay_compression_until >= now())::BIGINT AS delayed_compression
-			FROM chunk_candidates cc
+		query: `SELECT count(*)::BIGINT AS delayed_compression
+			FROM _prom_catalog.metric m
+			INNER JOIN _timescaledb_catalog.chunk c ON (c.schema_name = m.table_schema AND c.table_name = m.table_name)
+			INNER JOIN _timescaledb_catalog.chunk_constraint cc ON (cc.chunk_id = c.id)
 			INNER JOIN _timescaledb_catalog.dimension_slice ds ON ds.id = cc.dimension_slice_id
-			INNER JOIN _prom_catalog.metric m ON (m.table_name = cc.table_name AND m.table_schema = cc.schema_name)
 			WHERE NOT m.is_view
-			AND ds.range_start <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')
-			AND ds.range_end <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')`,
+			AND m.delay_compression_until IS NOT NULL
+			AND m.delay_compression_until >= now()
+			AND ds.range_start <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')
+			AND ds.range_end <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')`,
 	}, {
 		metrics: gauges(
 			prometheus.GaugeOpts{
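The patch above makes the exported gauge and the maintenance jobs share one source of truth: whatever `_prom_catalog.metric_chunks_that_need_to_be_compressed` reports is exactly what the compression job will pick up, so the metric can no longer disagree with the jobs (the drift behind #1741). A minimal sketch for sanity-checking the new query by hand, assuming pgx/v4 and a locally reachable Promscale database; the connection string and the standalone `main` wrapper are illustrative only, not part of the patch:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jackc/pgx/v4"
)

func main() {
	ctx := context.Background()
	// Hypothetical connection string; point it at your own instance.
	conn, err := pgx.Connect(ctx, "postgres://postgres@localhost:5432/promscale")
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close(ctx)

	// The same statement the metric now runs: sum the lengths of the
	// chunks_to_compress arrays returned by the maintenance function.
	const q = `SELECT coalesce(sum(jsonb_array_length(chunks_to_compress)), 0)::BIGINT AS uncompressed
		FROM _prom_catalog.metric_chunks_that_need_to_be_compressed(INTERVAL '1 hour');`

	var uncompressed int64
	if err := conn.QueryRow(ctx, q).Scan(&uncompressed); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("chunks pending compression: %d\n", uncompressed)
}
```

Against a live instance, the printed count should match the `promscale_sql_database_chunks_metrics_uncompressed_count` gauge within one 9-minute poll interval.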
From 3ba5bda32b568c6406682bec0348b23d200c3d98 Mon Sep 17 00:00:00 2001
From: Valery Meleshkin
Date: Thu, 15 Dec 2022 16:45:43 +0100
Subject: [PATCH 2/3] - Adding jitter to the "slow metrics" update start time
 - Adjusting alerts relying on
 promscale_sql_database_chunks_metrics_uncompressed_count

---
 CHANGELOG.md                            | 3 ++-
 docs/mixin/alerts/alerts.yaml           | 8 ++++----
 pkg/pgmodel/metrics/database/metrics.go | 9 ++++++++-
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6aa3f1eef3..58af4b4e57 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,7 +33,8 @@ We use the following categories for changes:
   metrics is expected to change faster than its new collection interval [#1793]
 
 ### Fixed
-
+- Fixed the query behind chunks_uncompressed. The new definition is
+  expected to change the baseline value [#1794]
 
 ## [0.16.0] - 2022-10-20
 
diff --git a/docs/mixin/alerts/alerts.yaml b/docs/mixin/alerts/alerts.yaml
index 813d09bebe..6f3461a47d 100644
--- a/docs/mixin/alerts/alerts.yaml
+++ b/docs/mixin/alerts/alerts.yaml
@@ -274,7 +274,7 @@ groups:
       expr: |
         (
           (
-            min_over_time(promscale_sql_database_chunks_metrics_uncompressed_count[1h]) > 10
+            min_over_time(promscale_sql_database_chunks_metrics_uncompressed_count[1h]) > promscale_sql_database_metric_count
           )
           and
           (
@@ -284,7 +284,7 @@ groups:
         or
         (
           (
-            min_over_time(promscale_sql_database_chunks_metrics_expired_count[1h]) > 10
+            min_over_time(promscale_sql_database_chunks_metrics_expired_count[1h]) > promscale_sql_database_metric_count
           )
           and
           (
@@ -294,7 +294,7 @@ groups:
         or
         (
          (
-            min_over_time(promscale_sql_database_chunks_traces_uncompressed_count[1h]) > 10
+            min_over_time(promscale_sql_database_chunks_traces_uncompressed_count[1h]) > promscale_sql_database_metric_count
           )
           and
           (
@@ -304,7 +304,7 @@ groups:
         or
         (
           (
-            min_over_time(promscale_sql_database_chunks_traces_expired_count[1h]) > 10
+            min_over_time(promscale_sql_database_chunks_traces_expired_count[1h]) > promscale_sql_database_metric_count
           )
           and
           (
diff --git a/pkg/pgmodel/metrics/database/metrics.go b/pkg/pgmodel/metrics/database/metrics.go
index 3a52403b49..dff790c26f 100644
--- a/pkg/pgmodel/metrics/database/metrics.go
+++ b/pkg/pgmodel/metrics/database/metrics.go
@@ -2,6 +2,7 @@ package database
 
 import (
 	"fmt"
+	"math/rand"
 	"strings"
 	"time"
 
@@ -47,10 +48,16 @@ type metricQueryPollConfig struct {
 }
 
 func updateAtMostEvery(interval time.Duration) metricQueryPollConfig {
+	// If we initialize lastUpdate as 0 or now - interval, then
+	// all the heavy queries that we aim to spread out by using this
+	// function will hammer the database simultaneously at the start.
+	// At the same time, delaying them for the full duration of the interval
+	// might be too much. Hence the jitter.
+	jitterDelta := time.Duration(rand.Int63n(int64(interval) / 3))
 	return metricQueryPollConfig{
 		enabled:    true,
 		interval:   interval,
-		lastUpdate: time.Now(),
+		lastUpdate: time.Now().Add(-interval + jitterDelta),
 	}
 }
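Back-dating `lastUpdate` by `interval - jitterDelta`, with `jitterDelta` drawn uniformly from `[0, interval/3)`, means each poller's first heavy query fires somewhere in the first third of its interval instead of every query hammering the database at startup. A self-contained sketch of that arithmetic, using the `math/rand` variant from this patch (the next patch swaps the RNG); `firstPollIn` is an illustrative helper, not part of the change:

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// firstPollIn mirrors the arithmetic in updateAtMostEvery: back-date
// lastUpdate by the full interval minus a random jitter, then compute
// how long until "now - lastUpdate >= interval" first holds.
func firstPollIn(interval time.Duration) time.Duration {
	jitterDelta := time.Duration(rand.Int63n(int64(interval) / 3))
	lastUpdate := time.Now().Add(-interval + jitterDelta)
	return time.Until(lastUpdate.Add(interval)) // ~= jitterDelta
}

func main() {
	// Each simulated poller fires its first heavy query within the
	// first third of the 9-minute interval, spread at random.
	for i := 0; i < 5; i++ {
		fmt.Printf("poller %d first fires in ~%v\n",
			i, firstPollIn(9*time.Minute).Round(time.Second))
	}
}
```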
From 67c0a483ba217fd90b70c4850077c544d3af248a Mon Sep 17 00:00:00 2001
From: Valery Meleshkin
Date: Thu, 15 Dec 2022 17:05:47 +0100
Subject: [PATCH 3/3] Fixing Gosec complaints

---
 pkg/pgmodel/metrics/database/metrics.go | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pkg/pgmodel/metrics/database/metrics.go b/pkg/pgmodel/metrics/database/metrics.go
index dff790c26f..87ab5f6a7e 100644
--- a/pkg/pgmodel/metrics/database/metrics.go
+++ b/pkg/pgmodel/metrics/database/metrics.go
@@ -1,8 +1,9 @@
 package database
 
 import (
+	"crypto/rand"
 	"fmt"
-	"math/rand"
+	"math/big"
 	"strings"
 	"time"
 
@@ -53,11 +54,14 @@ func updateAtMostEvery(interval time.Duration) metricQueryPollConfig {
 	// function will hammer the database simultaneously at the start.
 	// At the same time, delaying them for the full duration of the interval
 	// might be too much. Hence the jitter.
-	jitterDelta := time.Duration(rand.Int63n(int64(interval) / 3))
+	jitterDelta, err := rand.Int(rand.Reader, big.NewInt(int64(interval)/3))
+	if err != nil {
+		panic(err)
+	}
 	return metricQueryPollConfig{
 		enabled:    true,
 		interval:   interval,
-		lastUpdate: time.Now().Add(-interval + jitterDelta),
+		lastUpdate: time.Now().Add(-interval + time.Duration(jitterDelta.Int64())),
 	}
 }
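For reference, `updateAtMostEvery` as it reads with all three patches applied, reconstructed from the hunks above (the rest of the file is elided, and the struct is trimmed to the fields this function sets). `crypto/rand.Int` draws uniformly from `[0, max)`; the gosec complaint being silenced is presumably G404, the weak-random-number-source rule that flags `math/rand`:

```go
package database

import (
	"crypto/rand"
	"math/big"
	"time"
)

// Trimmed to the fields used below; the real struct lives in
// pkg/pgmodel/metrics/database and may carry more fields.
type metricQueryPollConfig struct {
	enabled    bool
	interval   time.Duration
	lastUpdate time.Time
}

func updateAtMostEvery(interval time.Duration) metricQueryPollConfig {
	// If we initialize lastUpdate as 0 or now - interval, then
	// all the heavy queries that we aim to spread out by using this
	// function will hammer the database simultaneously at the start.
	// At the same time, delaying them for the full duration of the interval
	// might be too much. Hence the jitter.
	jitterDelta, err := rand.Int(rand.Reader, big.NewInt(int64(interval)/3))
	if err != nil {
		// crypto/rand only fails if the OS entropy source is unavailable;
		// there is no useful fallback for a poll-scheduling jitter.
		panic(err)
	}
	return metricQueryPollConfig{
		enabled:    true,
		interval:   interval,
		lastUpdate: time.Now().Add(-interval + time.Duration(jitterDelta.Int64())),
	}
}
```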