Skip to content

Commit d84c896

Browse files
authored
Merge pull request #446 from Icinga/retention
Support a configurable retention policy
2 parents e326053 + e934b23 commit d84c896

10 files changed

Lines changed: 422 additions & 3 deletions

File tree

cmd/icinga-notifications/main.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ package main
22

33
import (
44
"context"
5+
"errors"
6+
"os/signal"
7+
"syscall"
8+
"time"
9+
510
"github.com/icinga/icinga-go-library/database"
611
"github.com/icinga/icinga-go-library/logging"
712
"github.com/icinga/icinga-go-library/utils"
@@ -11,10 +16,8 @@ import (
1116
"github.com/icinga/icinga-notifications/internal/daemon"
1217
"github.com/icinga/icinga-notifications/internal/incident"
1318
"github.com/icinga/icinga-notifications/internal/listener"
19+
"github.com/icinga/icinga-notifications/internal/retention"
1420
"github.com/okzk/sdnotify"
15-
"os/signal"
16-
"syscall"
17-
"time"
1821
)
1922

2023
func main() {
@@ -62,6 +65,13 @@ func main() {
6265
logger.Fatalf("Cannot load incidents from database: %+v", err)
6366
}
6467

68+
ret := retention.New(db, logs.GetChildLogger("retention"))
69+
go func() {
70+
if err := ret.Run(ctx); err != nil && !errors.Is(err, context.Canceled) {
71+
logger.Fatalf("Retention has finished with an error: %+v", err)
72+
}
73+
}()
74+
6575
// When Icinga Notifications is started by systemd, we've to notify systemd that we're ready.
6676
_ = sdnotify.Ready()
6777

config.example.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,35 @@ database:
6060
password: CHANGEME
6161
# password_file: /run/secerets/icinga_notifications_database_password
6262

63+
# The retention policy used by Icinga Notifications to determine how long no longer relevant historical data
64+
# is kept in the database before it is eligible for cleanup. Currently, this retention policy only applies to
65+
# closed incidents and related tables, but in the future it may be extended to other components as well.
66+
retention:
67+
# Retention period for historical data defined as duration string. This is the duration after which historical data
68+
# is considered old and eligible for cleanup. By default, Icinga Notifications does not automatically clean up any
69+
# data, i.e. the retention period is set to 0s. Setting this to a non-zero value enables the retention policy and
70+
# allows Icinga Notifications to automatically clean up historical data that is older than the specified duration.
71+
#
72+
# Valid units are "ms", "s", "m", "h". Defaults to "0s", which means no automatic cleanup.
73+
# period: 0s
74+
75+
# Interval for periodic cleanup of historical data defined as duration string. This is the interval at which
76+
# Icinga Notifications will run the cleanup process to prune old historical data as defined by the retention period.
77+
# Valid units are "ms", "s", "m", "h". Defaults to "1h".
78+
# interval: 1h
79+
80+
# Batch size for deleting historical data. This is the maximum number of records that will be deleted in a single
81+
# batch during the cleanup process. This is used to avoid long-running transactions and high load on the database
82+
# when deleting large amounts of historical data. Defaults to 5000.
83+
# batch_size: 5000
84+
85+
# Map of component-retention period pairs to define a different retention period than the default value for each
86+
# component. The retention period for each component defines how long historical data related to that component
87+
# is kept before it is eligible for cleanup. If a component is not listed here, the default retention period defined
88+
# above is used.
89+
# options:
90+
# incident: 480h
91+
6392
# Icinga Notifications logs its activities at various severity levels and any errors that occur either
6493
# on the console or in systemd's journal. The latter is used automatically when running under systemd.
6594
# In any case, the default log level is 'info'.
@@ -84,4 +113,5 @@ database:
84113
#database:
85114
#incident:
86115
#listener:
116+
#retention:
87117
#runtime-updates:

doc/03-Configuration.md

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,57 @@ For environment variables, each option is prefixed with `ICINGA_NOTIFICATIONS_DA
103103
| max_rows_per_transaction | **Optional.** Maximum number of rows Icinga Notifications is allowed to `SELECT`,`DELETE`,`UPDATE` or `INSERT` in a single transaction. Defaults to `8192`. |
104104
| wsrep_sync_wait | **Optional.** Enforce [Galera cluster](#galera-cluster) nodes to perform strict cluster-wide causality checks. Defaults to `7`. |
105105

106+
## Retention Configuration
107+
108+
You can configure the data retention and cleanup behavior of Icinga Notifications to automatically remove old data
109+
from the database after a certain period. This helps to manage the size of the database and ensure that it does not
110+
grow indefinitely.
111+
112+
For YAML configuration, the options are part of the `retention` dictionary.
113+
For environment variables, each option is prefixed with `ICINGA_NOTIFICATIONS_RETENTION_`.
114+
115+
| Option | Description |
116+
|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
117+
| period | **Optional.** Retention period defined as [duration string](#duration-string). Defaults to `0s` (disabled). Data older than this period will be automatically deleted from the database. |
118+
| interval | **Optional.** Interval for periodic retention checks defined as [duration string](#duration-string). Defaults to `1h`. |
119+
| batch_size | **Optional.** Maximum number of rows to delete in a single transaction during retention cleanup. Defaults to `5000`. |
120+
| options | **Optional.** Map of component name to retention period in order to set a different retention period for each component instead of the default one. See [retention components](#retention-components) for details. |
121+
122+
!!! info
123+
124+
By default, Icinga Notifications does not delete any data from the database, since the default retention period is
125+
`0s`, which means that all data is retained indefinitely. You can set the `period` option to a non-zero value to
126+
enable automatic data cleanup after the specified period has passed.
127+
128+
If you don't want to enable automatic data cleanup for all components, you can let the `period` option be `0s` and set
129+
non-zero retention periods for individual components in the `options` as [described below](#retention-components).
130+
This goes also for the other way around, if you want to enable automatic data cleanup for all components, you can set
131+
the `period` option to a non-zero value and set `0s` retention periods for a specific component to disable automatic
132+
data cleanup for that component.
133+
134+
### Retention Components
135+
136+
In addition to the default retention period, you can set a different retention period for each component of
137+
Icinga Notifications. This allows you to keep certain types of data for a longer or shorter period than others,
138+
depending on their importance or relevance.
139+
140+
For YAML configuration, the options are part of the `retention.options` dictionary.
141+
For environment variables, each option is prefixed with `ICINGA_NOTIFICATIONS_RETENTION_OPTIONS_` and expects
142+
a [duration string](#duration-string) as value.
143+
144+
Currently, the following components are available:
145+
146+
| Component | Description |
147+
|-----------|-----------------------------------------------------------------------|
148+
| incident | Incidents and all related data, such as their history, contacts, etc. |
149+
150+
!!! info
151+
152+
Incidents that are still open are not eligible for retention cleanup, even if they are open for months or years.
153+
The retention policy only applies to closed incidents, and the retention period is computed from the time they were
154+
closed, not from the time they were opened. If you want to clean up some incidents that are still open too, you need
155+
to close them first, and then wait for the retention period to pass.
156+
106157
## Logging Configuration
107158

108159
Configuration of the logging component used by Icinga Notifications.
@@ -151,6 +202,7 @@ ICINGA_NOTIFICATIONS_LOGGING_OPTIONS=database:error,listener:debug
151202
| database | Database connection status and queries. |
152203
| incident | Incident management and changes. |
153204
| listener | HTTP listener for event submission and debugging. |
205+
| retention | Data retention and cleanup. |
154206
| runtime-updates | Configuration changes through Icinga Notifications Web from the database. |
155207

156208
## Appendix

internal/daemon/config.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"fmt"
77
"net/url"
88
"os"
9+
"time"
910

1011
"github.com/creasty/defaults"
1112
"github.com/icinga/icinga-go-library/config"
@@ -57,6 +58,7 @@ type ConfigFile struct {
5758
Icingaweb2URL string `yaml:"icingaweb2_url" env:"ICINGAWEB2_URL"`
5859
Listener Listener `yaml:"listener" envPrefix:"LISTENER_"`
5960
Database database.Config `yaml:"database" envPrefix:"DATABASE_"`
61+
Retention Retention `yaml:"retention" envPrefix:"RETENTION_"`
6062
Logging logging.Config `yaml:"logging" envPrefix:"LOGGING_"`
6163

6264
// IcingaWeb2UrlParsed holds the parsed Icinga Web 2 URL after validation of the config file.
@@ -85,6 +87,9 @@ func (c *ConfigFile) Validate() error {
8587
if err := c.Logging.Validate(); err != nil {
8688
return err
8789
}
90+
if err := c.Retention.Validate(); err != nil {
91+
return err
92+
}
8893

8994
if c.Icingaweb2URL == "" {
9095
return errors.New("icingaweb2_url must be set")
@@ -106,10 +111,51 @@ func (c *ConfigFile) Validate() error {
106111
return nil
107112
}
108113

114+
// RetentionOpts defines additional overrides for retention periods of specific components.
115+
//
116+
// Currently, we only have a single component (incidents), but this leaves room for future expansion
117+
// without breaking the config structure. The fields here must be pointers to distinguish between
118+
// "not set" and "set to zero" (i.e. no retention) when overriding the default retention period.
119+
type RetentionOpts struct {
120+
Incident *time.Duration `yaml:"incident" env:"INCIDENT"`
121+
}
122+
123+
// Validate implements the [config.Validator] interface.
124+
func (r *RetentionOpts) Validate() error {
125+
if r.Incident != nil && *r.Incident < 0 {
126+
return errors.New("invalid retention period for incidents")
127+
}
128+
return nil
129+
}
130+
131+
// Retention defines the retention policy for Icinga Notifications history cleanups.
132+
type Retention struct {
133+
Period time.Duration `yaml:"period" env:"PERIOD"`
134+
Interval time.Duration `yaml:"interval" env:"INTERVAL" default:"1h"`
135+
BatchSize uint64 `yaml:"batch_size" env:"BATCH_SIZE" default:"5000"`
136+
Options RetentionOpts `yaml:"options" envPrefix:"OPTIONS_"`
137+
}
138+
139+
// Validate implements the [config.Validator] interface.
140+
func (r *Retention) Validate() error {
141+
if r.Period < 0 {
142+
return errors.New("invalid retention period")
143+
}
144+
if r.Interval <= 0 {
145+
return errors.New("interval must be greater than zero")
146+
}
147+
if r.BatchSize == 0 {
148+
return errors.New("'batch_size' must be greater than zero")
149+
}
150+
return r.Options.Validate()
151+
}
152+
109153
// Assert interface compliance.
110154
var (
111155
_ defaults.Setter = (*ConfigFile)(nil)
112156
_ config.Validator = (*ConfigFile)(nil)
157+
_ config.Validator = (*Listener)(nil)
158+
_ config.Validator = (*Retention)(nil)
113159
)
114160

115161
// Flags defines the CLI flags supported by Icinga Notifications.

internal/retention/prune.go

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
package retention
2+
3+
import (
4+
"context"
5+
"fmt"
6+
7+
"github.com/icinga/icinga-go-library/backoff"
8+
"github.com/icinga/icinga-go-library/com"
9+
"github.com/icinga/icinga-go-library/database"
10+
"github.com/icinga/icinga-go-library/retry"
11+
"github.com/icinga/icinga-go-library/types"
12+
"github.com/jmoiron/sqlx"
13+
)
14+
15+
// TimeBoundPruner defines the configuration for pruning rows from a table based on a time column.
16+
//
17+
// This struct is designed to be flexible and reusable for different tables with varying time columns and
18+
// primary keys. It also supports maintaining referential integrity by allowing the definition of related
19+
// tables that reference the primary keys of the main table, ensuring that any related rows are also pruned
20+
// accordingly.
21+
type TimeBoundPruner struct {
22+
Table string
23+
PK string
24+
TimeColumn string
25+
26+
Referrers []ReferencingRowPruner
27+
}
28+
29+
// Exec prunes rows from the specified table that are older than the given time threshold.
30+
//
31+
// If Referrers are defined, it first retrieves the primary keys of the rows to be deleted and then executes the
32+
// corresponding DELETE statements for each [ReferencingRowPruner] before finally deleting the rows from the main
33+
// table. If no Referrers are defined, it directly deletes rows based on the time column.
34+
//
35+
// The method returns only the total number of deleted rows from the main table.
36+
func (tbp *TimeBoundPruner) Exec(ctx context.Context, db *database.DB, olderThan types.UnixMilli, limit uint64) (uint64, error) {
37+
deleteStmt := tbp.assembleDelete(db.DriverName(), limit, len(tbp.Referrers) > 0)
38+
if len(tbp.Referrers) == 0 {
39+
return exec(ctx, db, deleteStmt, limit, olderThan)
40+
}
41+
42+
selectStmt := tbp.assembleSelect(limit)
43+
var total uint64
44+
for {
45+
var ids []any
46+
err := retry.WithBackoff(
47+
ctx,
48+
func(ctx context.Context) error {
49+
if err := db.SelectContext(ctx, &ids, db.Rebind(selectStmt), olderThan); err != nil {
50+
return database.CantPerformQuery(err, selectStmt)
51+
}
52+
return nil
53+
},
54+
retry.Retryable,
55+
backoff.DefaultBackoff,
56+
db.GetDefaultRetrySettings(),
57+
)
58+
if err != nil {
59+
return 0, err
60+
}
61+
62+
if len(ids) == 0 {
63+
// No rows to delete, so we can skip executing the referrers and the final delete statement.
64+
return total, nil
65+
}
66+
67+
for _, referrer := range tbp.Referrers {
68+
if _, err := referrer.Exec(ctx, db, ids...); err != nil {
69+
return 0, err
70+
}
71+
}
72+
73+
if affected, err := exec(ctx, db, deleteStmt, 0, ids...); err != nil {
74+
return 0, err
75+
} else {
76+
total += affected
77+
}
78+
}
79+
}
80+
81+
// assembleSelect constructs a select stmt to retrieve primary keys of this pruner based on the time column and limit.
82+
func (tbp *TimeBoundPruner) assembleSelect(limit uint64) string {
83+
return fmt.Sprintf(`SELECT %s FROM %s WHERE %[3]s IS NOT NULL AND %[3]s < ? LIMIT %d`, tbp.PK, tbp.Table, tbp.TimeColumn, limit)
84+
}
85+
86+
// assembleDelete constructs a delete stmt for this pruner based on the database driver and whether we are
87+
// deleting by primary keys or by time column.
88+
func (tbp *TimeBoundPruner) assembleDelete(driverName string, limit uint64, byPKs bool) string {
89+
if byPKs {
90+
// The limit doesn't apply when deleting by PKs, as the number of PKs is already limited by the provided arguments.
91+
return fmt.Sprintf(`DELETE FROM %s WHERE %s IN (?)`, tbp.Table, tbp.PK)
92+
}
93+
94+
switch driverName {
95+
case database.MySQL:
96+
return fmt.Sprintf(`DELETE FROM %s WHERE %[2]s IS NOT NULL AND %[2]s < ? LIMIT %d`, tbp.Table, tbp.TimeColumn, limit)
97+
case database.PostgreSQL:
98+
return fmt.Sprintf(`
99+
WITH rows AS (SELECT %[1]s FROM %[2]s WHERE %[3]s IS NOT NULL AND %[3]s < ? LIMIT %[4]d)
100+
DELETE FROM %[2]s WHERE %[1]s IN (SELECT * FROM rows)`,
101+
tbp.PK, tbp.Table, tbp.TimeColumn, limit)
102+
default:
103+
panic(fmt.Sprintf("invalid database type %s", driverName))
104+
}
105+
}
106+
107+
// ReferencingRowPruner defines the configuration for pruning rows that reference the primary keys of another table.
108+
//
109+
// This struct is used to maintain referential integrity when pruning rows from a main table by ensuring that
110+
// any related rows in other tables are also pruned accordingly. In other words, it allows to explicitly define
111+
// cascading deletes for related tables without relying on database-level foreign key constraints.
112+
type ReferencingRowPruner struct {
113+
Table string
114+
FK string
115+
}
116+
117+
// Exec performs the pruning operation by deleting rows from the specified table that reference the given primary keys.
118+
func (rrp *ReferencingRowPruner) Exec(ctx context.Context, db *database.DB, pks ...any) (uint64, error) {
119+
return exec(ctx, db, rrp.assembleDelete(), 0, pks...)
120+
}
121+
122+
// assembleDelete constructs the delete statement for the ReferencingRowPruner based on the foreign key.
123+
func (rrp *ReferencingRowPruner) assembleDelete() string {
124+
return fmt.Sprintf(`DELETE FROM %s WHERE %s IN (?)`, rrp.Table, rrp.FK)
125+
}
126+
127+
// exec executes the given delete statement with the provided arguments and limit, handling retries and counting affected rows.
128+
func exec(ctx context.Context, db *database.DB, query string, limit uint64, args ...any) (uint64, error) {
129+
stmt, expandedArgs, err := sqlx.In(query, args)
130+
if err != nil {
131+
return 0, err
132+
}
133+
134+
var counter com.Counter
135+
defer db.Log(ctx, query, &counter).Stop()
136+
137+
for {
138+
var affected uint64
139+
err = retry.WithBackoff(
140+
ctx,
141+
func(ctx context.Context) error {
142+
res, err := db.ExecContext(ctx, db.Rebind(stmt), expandedArgs...)
143+
if err != nil {
144+
return database.CantPerformQuery(err, query)
145+
}
146+
147+
n, err := res.RowsAffected()
148+
if err == nil && n > 0 {
149+
affected = uint64(n)
150+
}
151+
return err
152+
},
153+
retry.Retryable,
154+
backoff.DefaultBackoff,
155+
db.GetDefaultRetrySettings(),
156+
)
157+
if err != nil {
158+
return 0, err
159+
}
160+
161+
counter.Add(affected)
162+
// If limit is set to 0, it means we are deleting matching rows by primary keys, so the limit check can be skipped.
163+
if limit == 0 || affected < limit {
164+
break
165+
}
166+
}
167+
168+
return counter.Total(), nil
169+
}

0 commit comments

Comments
 (0)