Skip to content
This repository has been archived by the owner on Apr 2, 2024. It is now read-only.

A fix for chunks_uncompressed metric #1794

Merged
merged 3 commits into from
Dec 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ We use the following categories for changes:
metrics is expected to change faster than its new collection interval [#1793]

### Fixed

- Fix the query behind chunks_uncompressed; the new definition should
change the baseline value [#1794]

## [0.16.0] - 2022-10-20

Expand Down
8 changes: 4 additions & 4 deletions docs/mixin/alerts/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ groups:
expr: |
(
(
min_over_time(promscale_sql_database_chunks_metrics_uncompressed_count[1h]) > 10
min_over_time(promscale_sql_database_chunks_metrics_uncompressed_count[1h]) > promscale_sql_database_metric_count
)
and
(
Expand All @@ -284,7 +284,7 @@ groups:
or
(
(
min_over_time(promscale_sql_database_chunks_metrics_expired_count[1h]) > 10
min_over_time(promscale_sql_database_chunks_metrics_expired_count[1h]) > promscale_sql_database_metric_count
)
and
(
Expand All @@ -294,7 +294,7 @@ groups:
or
(
(
min_over_time(promscale_sql_database_chunks_traces_uncompressed_count[1h]) > 10
min_over_time(promscale_sql_database_chunks_traces_uncompressed_count[1h]) > promscale_sql_database_metric_count
)
and
(
Expand All @@ -304,7 +304,7 @@ groups:
or
(
(
min_over_time(promscale_sql_database_chunks_traces_expired_count[1h]) > 10
min_over_time(promscale_sql_database_chunks_traces_expired_count[1h]) > promscale_sql_database_metric_count
)
and
(
Expand Down
43 changes: 26 additions & 17 deletions pkg/pgmodel/metrics/database/metrics.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package database

import (
"crypto/rand"
"fmt"
"math/big"
"strings"
"time"

Expand Down Expand Up @@ -47,10 +49,19 @@ type metricQueryPollConfig struct {
}

// updateAtMostEvery builds a poll config that refreshes a metric query at
// most once per interval, with a randomized initial offset (jitter).
func updateAtMostEvery(interval time.Duration) metricQueryPollConfig {
	// If we initialize lastUpdate as 0 or now - interval, then
	// all the heavy queries that we aim to spread out by using this
	// function will hammer the database simultaneously at the start.
	// At the same time delaying them for the full duration of interval
	// might be too much. Hence the jitter: pick a uniform offset in
	// [0, interval/3) so first runs are staggered across instances.
	jitterDelta, err := rand.Int(rand.Reader, big.NewInt(int64(interval)/3))
	if err != nil {
		// crypto/rand should never fail here; treat failure as a
		// programmer/environment bug rather than a recoverable error.
		panic(err)
	}
	return metricQueryPollConfig{
		enabled:  true,
		interval: interval,
		// Backdate lastUpdate by (interval - jitter) so the first poll
		// fires after only the jitter delay, not a full interval.
		lastUpdate: time.Now().Add(-interval + time.Duration(jitterDelta.Int64())),
	}
}

Expand Down Expand Up @@ -149,6 +160,12 @@ var metrics = []metricQueryWrap{
Name: "chunks_metrics_uncompressed_count",
Help: "The number of metrics chunks soon to be compressed by maintenance jobs.",
},
),
customPollConfig: updateAtMostEvery(9 * time.Minute),
query: `SELECT coalesce(sum(jsonb_array_length(chunks_to_compress)), 0)::BIGINT AS uncompressed
FROM _prom_catalog.metric_chunks_that_need_to_be_compressed(INTERVAL '1 hour');`,
}, {
metrics: gauges(
prometheus.GaugeOpts{
Namespace: util.PromNamespace,
Subsystem: "sql_database",
Expand All @@ -157,24 +174,16 @@ var metrics = []metricQueryWrap{
},
),
customPollConfig: updateAtMostEvery(9 * time.Minute),
query: `WITH chunk_candidates AS MATERIALIZED (
SELECT chcons.dimension_slice_id, h.table_name, h.schema_name
FROM _timescaledb_catalog.chunk_constraint chcons
INNER JOIN _timescaledb_catalog.chunk c ON c.id = chcons.chunk_id
INNER JOIN _timescaledb_catalog.hypertable h ON h.id = c.hypertable_id
WHERE c.dropped IS FALSE
AND h.compression_state = 1 -- compression_enabled = TRUE
AND (c.status & 1) != 1 -- only check for uncompressed chunks
)
SELECT
count(*) FILTER(WHERE m.delay_compression_until IS NULL OR m.delay_compression_until < now())::BIGINT AS uncompressed,
count(*) FILTER(WHERE m.delay_compression_until IS NOT NULL AND m.delay_compression_until >= now())::BIGINT AS delayed_compression
FROM chunk_candidates cc
query: `SELECT count(*)::BIGINT AS delayed_compression
FROM _prom_catalog.metric m
INNER JOIN _timescaledb_catalog.chunk c ON (c.schema_name = m.table_schema AND c.table_name = m.table_schema)
INNER JOIN _timescaledb_catalog.chunk_constraint cc ON (cc.chunk_id = c.id)
INNER JOIN _timescaledb_catalog.dimension_slice ds ON ds.id = cc.dimension_slice_id
INNER JOIN _prom_catalog.metric m ON (m.table_name = cc.table_name AND m.table_schema = cc.schema_name)
WHERE NOT m.is_view
AND ds.range_start <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')
AND ds.range_end <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')`,
AND m.delay_compression_until IS NOT NULL
AND m.delay_compression_until >= now()
AND ds.range_start <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')
AND ds.range_end <= _timescaledb_internal.time_to_internal(now() - interval '1 hour')`,
}, {
metrics: gauges(
prometheus.GaugeOpts{
Expand Down