From 721fca6145c98478c52b02603ee84abc809cdd2b Mon Sep 17 00:00:00 2001
From: Valery Meleshkin
Date: Wed, 14 Dec 2022 18:19:45 +0100
Subject: [PATCH 1/3] Fixing the query behind the chunks_uncompressed metric
 by making it rely on a function used by the maintenance jobs.

It should also fix #1741
---
 pkg/pgmodel/metrics/database/metrics.go | 30 ++++++++++++-------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/pkg/pgmodel/metrics/database/metrics.go b/pkg/pgmodel/metrics/database/metrics.go
index aea9d053b4..3a52403b49 100644
--- a/pkg/pgmodel/metrics/database/metrics.go
+++ b/pkg/pgmodel/metrics/database/metrics.go
@@ -149,6 +149,12 @@ var metrics = []metricQueryWrap{
 				Name:      "chunks_metrics_uncompressed_count",
 				Help:      "The number of metrics chunks soon to be compressed by maintenance jobs.",
 			},
+		),
+		customPollConfig: updateAtMostEvery(9 * time.Minute),
+		query: `SELECT coalesce(sum(jsonb_array_length(chunks_to_compress)), 0)::BIGINT AS uncompressed
+			FROM _prom_catalog.metric_chunks_that_need_to_be_compressed(INTERVAL '1 hour');`,
+	}, {
+		metrics: gauges(
 			prometheus.GaugeOpts{
 				Namespace: util.PromNamespace,
 				Subsystem: "sql_database",
@@ -157,24 +163,16 @@
 			},
 		),
 		customPollConfig: updateAtMostEvery(9 * time.Minute),
-		query: `WITH chunk_candidates AS MATERIALIZED (
-			SELECT chcons.dimension_slice_id, h.table_name, h.schema_name
-			FROM _timescaledb_catalog.chunk_constraint chcons
-			INNER JOIN _timescaledb_catalog.chunk c ON c.id = chcons.chunk_id
-			INNER JOIN _timescaledb_catalog.hypertable h ON h.id = c.hypertable_id
-			WHERE c.dropped IS FALSE
-			AND h.compression_state = 1 -- compression_enabled = TRUE
-			AND (c.status & 1) != 1 -- only check for uncompressed chunks
-			)
-			SELECT
-			count(*) FILTER(WHERE m.delay_compression_until IS NULL OR m.delay_compression_until < now())::BIGINT AS uncompressed,
-			count(*) FILTER(WHERE m.delay_compression_until IS NOT NULL AND m.delay_compression_until >= now())::BIGINT AS delayed_compression
-			FROM chunk_candidates cc
+		query: `SELECT count(*)::BIGINT AS delayed_compression
+			FROM _prom_catalog.metric m
+			INNER JOIN _timescaledb_catalog.chunk c ON (c.schema_name = m.table_schema AND c.table_name = m.table_name)
+			INNER JOIN _timescaledb_catalog.chunk_constraint cc ON (cc.chunk_id = c.id)
 			INNER JOIN _timescaledb_catalog.dimension_slice ds ON ds.id = cc.dimension_slice_id
-			INNER JOIN _prom_catalog.metric m ON (m.table_name = cc.table_name AND m.table_schema = cc.schema_name)
 			WHERE NOT m.is_view
-			AND ds.range_start <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')
-			AND ds.range_end <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')`,
+			AND m.delay_compression_until IS NOT NULL
+			AND m.delay_compression_until >= now()
+			AND ds.range_start <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')
+			AND ds.range_end <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')`,
 	}, {
 		metrics: gauges(
 			prometheus.GaugeOpts{
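The patch above makes the exported gauge and the maintenance jobs share one source of truth: whatever `_prom_catalog.metric_chunks_that_need_to_be_compressed` reports is exactly what the compression job will pick up, so the metric can no longer disagree with the jobs (the drift behind #1741). A minimal sketch for sanity-checking the new query by hand, assuming pgx/v4 and a locally reachable Promscale database; the connection string and the standalone `main` wrapper are illustrative only, not part of the patch:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jackc/pgx/v4"
)

func main() {
	ctx := context.Background()
	// Hypothetical connection string; point it at your own instance.
	conn, err := pgx.Connect(ctx, "postgres://postgres@localhost:5432/promscale")
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close(ctx)

	// The same statement the metric now runs: sum the lengths of the
	// chunks_to_compress arrays returned by the maintenance function.
	const q = `SELECT coalesce(sum(jsonb_array_length(chunks_to_compress)), 0)::BIGINT AS uncompressed
		FROM _prom_catalog.metric_chunks_that_need_to_be_compressed(INTERVAL '1 hour');`

	var uncompressed int64
	if err := conn.QueryRow(ctx, q).Scan(&uncompressed); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("chunks pending compression: %d\n", uncompressed)
}
```

Against a live instance, the printed count should match the `promscale_sql_database_chunks_metrics_uncompressed_count` gauge within one 9-minute poll interval.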
From 3ba5bda32b568c6406682bec0348b23d200c3d98 Mon Sep 17 00:00:00 2001
From: Valery Meleshkin
Date: Thu, 15 Dec 2022 16:45:43 +0100
Subject: [PATCH 2/3] - Adding jitter to the "slow metrics" update start time
 - Adjusting alerts relying on
 promscale_sql_database_chunks_metrics_uncompressed_count

---
 CHANGELOG.md                            | 3 ++-
 docs/mixin/alerts/alerts.yaml           | 8 ++++----
 pkg/pgmodel/metrics/database/metrics.go | 9 ++++++++-
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6aa3f1eef3..58af4b4e57 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,7 +33,8 @@ We use the following categories for changes:
   metrics is expected to change faster than its new collection interval [#1793]
 
 ### Fixed
-
+- Fixed the query behind chunks_uncompressed. The new definition is
+  expected to change the baseline value [#1794]
 
 ## [0.16.0] - 2022-10-20
 
diff --git a/docs/mixin/alerts/alerts.yaml b/docs/mixin/alerts/alerts.yaml
index 813d09bebe..6f3461a47d 100644
--- a/docs/mixin/alerts/alerts.yaml
+++ b/docs/mixin/alerts/alerts.yaml
@@ -274,7 +274,7 @@ groups:
       expr: |
         (
           (
-            min_over_time(promscale_sql_database_chunks_metrics_uncompressed_count[1h]) > 10
+            min_over_time(promscale_sql_database_chunks_metrics_uncompressed_count[1h]) > promscale_sql_database_metric_count
           )
           and
           (
@@ -284,7 +284,7 @@ groups:
         or
         (
           (
-            min_over_time(promscale_sql_database_chunks_metrics_expired_count[1h]) > 10
+            min_over_time(promscale_sql_database_chunks_metrics_expired_count[1h]) > promscale_sql_database_metric_count
           )
           and
           (
@@ -294,7 +294,7 @@ groups:
         or
         (
          (
-            min_over_time(promscale_sql_database_chunks_traces_uncompressed_count[1h]) > 10
+            min_over_time(promscale_sql_database_chunks_traces_uncompressed_count[1h]) > promscale_sql_database_metric_count
           )
           and
           (
@@ -304,7 +304,7 @@ groups:
         or
         (
           (
-            min_over_time(promscale_sql_database_chunks_traces_expired_count[1h]) > 10
+            min_over_time(promscale_sql_database_chunks_traces_expired_count[1h]) > promscale_sql_database_metric_count
           )
           and
           (
diff --git a/pkg/pgmodel/metrics/database/metrics.go b/pkg/pgmodel/metrics/database/metrics.go
index 3a52403b49..dff790c26f 100644
--- a/pkg/pgmodel/metrics/database/metrics.go
+++ b/pkg/pgmodel/metrics/database/metrics.go
@@ -2,6 +2,7 @@ package database
 
 import (
 	"fmt"
+	"math/rand"
 	"strings"
 	"time"
 
@@ -47,10 +48,16 @@ type metricQueryPollConfig struct {
 }
 
 func updateAtMostEvery(interval time.Duration) metricQueryPollConfig {
+	// If we initialize lastUpdate as 0 or now - interval, then
+	// all the heavy queries that we aim to spread out by using this
+	// function will hammer the database simultaneously at the start.
+	// At the same time, delaying them for the full duration of the interval
+	// might be too much. Hence the jitter.
+	jitterDelta := time.Duration(rand.Int63n(int64(interval) / 3))
 	return metricQueryPollConfig{
 		enabled:    true,
 		interval:   interval,
-		lastUpdate: time.Now(),
+		lastUpdate: time.Now().Add(-interval + jitterDelta),
 	}
 }
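Back-dating `lastUpdate` by `interval - jitterDelta`, with `jitterDelta` drawn uniformly from `[0, interval/3)`, means each poller's first heavy query fires somewhere in the first third of its interval instead of every query hammering the database at startup. A self-contained sketch of that arithmetic, using the `math/rand` variant from this patch (the next patch swaps the RNG); `firstPollIn` is an illustrative helper, not part of the change:

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// firstPollIn mirrors the arithmetic in updateAtMostEvery: back-date
// lastUpdate by the full interval minus a random jitter, then compute
// how long until "now - lastUpdate >= interval" first holds.
func firstPollIn(interval time.Duration) time.Duration {
	jitterDelta := time.Duration(rand.Int63n(int64(interval) / 3))
	lastUpdate := time.Now().Add(-interval + jitterDelta)
	return time.Until(lastUpdate.Add(interval)) // ~= jitterDelta
}

func main() {
	// Each simulated poller fires its first heavy query within the
	// first third of the 9-minute interval, spread at random.
	for i := 0; i < 5; i++ {
		fmt.Printf("poller %d first fires in ~%v\n",
			i, firstPollIn(9*time.Minute).Round(time.Second))
	}
}
```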
From 67c0a483ba217fd90b70c4850077c544d3af248a Mon Sep 17 00:00:00 2001
From: Valery Meleshkin
Date: Thu, 15 Dec 2022 17:05:47 +0100
Subject: [PATCH 3/3] Fixing Gosec complaints

---
 pkg/pgmodel/metrics/database/metrics.go | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pkg/pgmodel/metrics/database/metrics.go b/pkg/pgmodel/metrics/database/metrics.go
index dff790c26f..87ab5f6a7e 100644
--- a/pkg/pgmodel/metrics/database/metrics.go
+++ b/pkg/pgmodel/metrics/database/metrics.go
@@ -1,8 +1,9 @@
 package database
 
 import (
+	"crypto/rand"
 	"fmt"
-	"math/rand"
+	"math/big"
 	"strings"
 	"time"
 
@@ -53,11 +54,14 @@ func updateAtMostEvery(interval time.Duration) metricQueryPollConfig {
 	// function will hammer the database simultaneously at the start.
 	// At the same time, delaying them for the full duration of the interval
 	// might be too much. Hence the jitter.
-	jitterDelta := time.Duration(rand.Int63n(int64(interval) / 3))
+	jitterDelta, err := rand.Int(rand.Reader, big.NewInt(int64(interval)/3))
+	if err != nil {
+		panic(err)
+	}
 	return metricQueryPollConfig{
 		enabled:    true,
 		interval:   interval,
-		lastUpdate: time.Now().Add(-interval + jitterDelta),
+		lastUpdate: time.Now().Add(-interval + time.Duration(jitterDelta.Int64())),
 	}
 }
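For reference, `updateAtMostEvery` as it reads with all three patches applied, reconstructed from the hunks above (the rest of the file is elided, and the struct is trimmed to the fields this function sets). `crypto/rand.Int` draws uniformly from `[0, max)`; the gosec complaint being silenced is presumably G404, the weak-random-number-source rule that flags `math/rand`:

```go
package database

import (
	"crypto/rand"
	"math/big"
	"time"
)

// Trimmed to the fields used below; the real struct lives in
// pkg/pgmodel/metrics/database and may carry more fields.
type metricQueryPollConfig struct {
	enabled    bool
	interval   time.Duration
	lastUpdate time.Time
}

func updateAtMostEvery(interval time.Duration) metricQueryPollConfig {
	// If we initialize lastUpdate as 0 or now - interval, then
	// all the heavy queries that we aim to spread out by using this
	// function will hammer the database simultaneously at the start.
	// At the same time, delaying them for the full duration of the interval
	// might be too much. Hence the jitter.
	jitterDelta, err := rand.Int(rand.Reader, big.NewInt(int64(interval)/3))
	if err != nil {
		// crypto/rand only fails if the OS entropy source is unavailable;
		// there is no useful fallback for a poll-scheduling jitter.
		panic(err)
	}
	return metricQueryPollConfig{
		enabled:    true,
		interval:   interval,
		lastUpdate: time.Now().Add(-interval + time.Duration(jitterDelta.Int64())),
	}
}
```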