Compare commits

...

10 Commits

Author SHA1 Message Date
Tito Lins
b3efe99482 upgrade alerting 2025-11-28 18:46:33 -03:00
Tito Lins
d72c046df2 add log 2025-11-28 18:10:43 -03:00
Tito Lins
1229a15cfd use IsEnabledGlobally 2025-11-28 18:10:26 -03:00
Tito Lins
c833faf7ec comment MergeFlushLog for now (needs to be added to the response first) 2025-11-28 12:04:24 -03:00
Tito Lins
ea7e2d7e7d pass dispatch timer to remote AM 2025-11-28 12:04:24 -03:00
Tito Lins
ebec073d3c use dispatch timer for built-in AM 2025-11-28 12:04:24 -03:00
Tito Lins
810da084b9 add dispatch timer helper 2025-11-28 12:04:24 -03:00
Tito Lins
4808803e0d add feature flag 2025-11-28 12:04:24 -03:00
Tito Lins
b5e11cf8e9 alertmanager: add flush log to file store 2025-11-28 12:00:25 -03:00
Tito Lins
a551bb7f61 update alerting 2025-11-28 12:00:24 -03:00
15 changed files with 118 additions and 12 deletions

4
go.mod
View File

@@ -87,7 +87,7 @@ require (
github.com/googleapis/gax-go/v2 v2.15.0 // @grafana/grafana-backend-group
github.com/gorilla/mux v1.8.1 // @grafana/grafana-backend-group
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // @grafana/grafana-app-platform-squad
github.com/grafana/alerting v0.0.0-20251119204204-77fa75125181 // @grafana/alerting-backend
github.com/grafana/alerting v0.0.0-20251128214501-29edef10542b // @grafana/alerting-backend
github.com/grafana/authlib v0.0.0-20250930082137-a40e2c2b094f // @grafana/identity-access-team
github.com/grafana/authlib/types v0.0.0-20251119142549-be091cf2f4d4 // @grafana/identity-access-team
github.com/grafana/dataplane/examples v0.0.1 // @grafana/observability-metrics
@@ -697,7 +697,7 @@ require (
replace github.com/crewjam/saml => github.com/grafana/saml v0.4.15-0.20240917091248-ae3bbdad8a56
// Use our fork of the upstream Alertmanager.
replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20250911094103-5456b6e45604
replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20251128214251-5dfe7baf90af
exclude github.com/mattn/go-sqlite3 v2.0.3+incompatible

10
go.sum
View File

@@ -1611,8 +1611,10 @@ github.com/gorilla/sessions v1.2.1 h1:DHd3rPN5lE3Ts3D8rKkQ8x/0kqfeNmBAaiSi+o7Fsg
github.com/gorilla/sessions v1.2.1/go.mod h1:dk2InVEVJ0sfLlnXv9EAgkf6ecYs/i80K/zI+bUmuGM=
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo=
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA=
github.com/grafana/alerting v0.0.0-20251119204204-77fa75125181 h1:nbxKRtrbuhvOYmI2RhOYauHRJCtpR+vTNIgg1lFUCws=
github.com/grafana/alerting v0.0.0-20251119204204-77fa75125181/go.mod h1:VtPNIFlEOJPPEc13Ax6ZTbNV3M/sAzLID72YjgzOPVA=
github.com/grafana/alerting v0.0.0-20251119223942-7e109cc4f3e9 h1:qHY1ibKD/S5CyYi1VtUGKSq5kbgA7dTv/OYtlEMklqg=
github.com/grafana/alerting v0.0.0-20251119223942-7e109cc4f3e9/go.mod h1:VtPNIFlEOJPPEc13Ax6ZTbNV3M/sAzLID72YjgzOPVA=
github.com/grafana/alerting v0.0.0-20251128214501-29edef10542b h1:QI2xO0UVmGtkv6CuQSCaoW0uw3SV7MDsC/Mwehl75II=
github.com/grafana/alerting v0.0.0-20251128214501-29edef10542b/go.mod h1:QAWfUswv2DaC5dD45o3vXia/QDvLi17jT4u4UToeF7A=
github.com/grafana/authlib v0.0.0-20250930082137-a40e2c2b094f h1:Cbm6OKkOcJ+7CSZsGsEJzktC/SIa5bxVeYKQLuYK86o=
github.com/grafana/authlib v0.0.0-20250930082137-a40e2c2b094f/go.mod h1:axY0cdOg3q0TZHwpHnIz5x16xZ8ZBxJHShsSHHXcHQg=
github.com/grafana/authlib/types v0.0.0-20251119142549-be091cf2f4d4 h1:Muoy+FMGrHj3GdFbvsMzUT7eusgii9PKf9L1ZaXDDbY=
@@ -1665,8 +1667,8 @@ github.com/grafana/nanogit v0.3.0 h1:XNEef+4Vi+465ZITJs/g/xgnDRJbWhhJ7iQrAnWZ0oQ
github.com/grafana/nanogit v0.3.0/go.mod h1:6s6CCTpyMOHPpcUZaLGI+rgBEKdmxVbhqSGgCK13j7Y=
github.com/grafana/otel-profiling-go v0.5.1 h1:stVPKAFZSa7eGiqbYuG25VcqYksR6iWvF3YH66t4qL8=
github.com/grafana/otel-profiling-go v0.5.1/go.mod h1:ftN/t5A/4gQI19/8MoWurBEtC6gFw8Dns1sJZ9W4Tls=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20250911094103-5456b6e45604 h1:aXfUhVN/Ewfpbko2CCtL65cIiGgwStOo4lWH2b6gw2U=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20250911094103-5456b6e45604/go.mod h1:O/QP1BCm0HHIzbKvgMzqb5sSyH88rzkFk84F4TfJjBU=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20251128214251-5dfe7baf90af h1:X5p+fjoW+062S7GmTIyG3c7GcSVhCPo+/2+5Hgah6AQ=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20251128214251-5dfe7baf90af/go.mod h1:O/QP1BCm0HHIzbKvgMzqb5sSyH88rzkFk84F4TfJjBU=
github.com/grafana/pyroscope-go/godeltaprof v0.1.9 h1:c1Us8i6eSmkW+Ez05d3co8kasnuOY813tbMN8i/a3Og=
github.com/grafana/pyroscope-go/godeltaprof v0.1.9/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU=
github.com/grafana/pyroscope/api v1.2.1-0.20250415190842-3ff7247547ae h1:35W3Wjp9KWnSoV/DuymmyIj5aHE0CYlDQ5m2KeXUPAc=

View File

@@ -1932,6 +1932,7 @@ golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210503080704-8803ae5d1324/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210616045830-e2b7044e8c71/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=

View File

@@ -1201,4 +1201,8 @@ export interface FeatureToggles {
* Adds support for Kubernetes alerting historian APIs
*/
kubernetesAlertingHistorian?: boolean;
/**
* Use synchronized dispatch timer to minimize duplicate notifications across alertmanager HA pods
*/
alertingSyncDispatchTimer?: boolean;
}

View File

@@ -1984,6 +1984,14 @@ var (
Owner: grafanaAlertingSquad,
RequiresRestart: true,
},
{
Name: "alertingSyncDispatchTimer",
Description: "Use synchronized dispatch timer to minimize duplicate notifications across alertmanager HA pods",
Stage: FeatureStageExperimental,
Owner: grafanaAlertingSquad,
RequiresRestart: true,
HideFromDocs: true,
},
}
)

View File

@@ -269,3 +269,4 @@ ttlPluginInstanceManager,experimental,@grafana/plugins-platform-backend,false,fa
lokiQueryLimitsContext,experimental,@grafana/observability-logs,false,false,true
rudderstackUpgrade,experimental,@grafana/grafana-frontend-platform,false,false,true
kubernetesAlertingHistorian,experimental,@grafana/alerting-squad,false,true,false
alertingSyncDispatchTimer,experimental,@grafana/alerting-squad,false,true,false
1 Name Stage Owner requiresDevMode RequiresRestart FrontendOnly
269 lokiQueryLimitsContext experimental @grafana/observability-logs false false true
270 rudderstackUpgrade experimental @grafana/grafana-frontend-platform false false true
271 kubernetesAlertingHistorian experimental @grafana/alerting-squad false true false
272 alertingSyncDispatchTimer experimental @grafana/alerting-squad false true false

View File

@@ -765,4 +765,8 @@ const (
// FlagKubernetesAlertingHistorian
// Adds support for Kubernetes alerting historian APIs
FlagKubernetesAlertingHistorian = "kubernetesAlertingHistorian"
// FlagAlertingSyncDispatchTimer
// Use synchronized dispatch timer to minimize duplicate notifications across alertmanager HA pods
FlagAlertingSyncDispatchTimer = "alertingSyncDispatchTimer"
)

View File

@@ -498,6 +498,24 @@
"codeowner": "@grafana/alerting-squad"
}
},
{
"metadata": {
"name": "alertingSyncDispatchTimer",
"resourceVersion": "1764342211090",
"creationTimestamp": "2025-11-19T19:22:33Z",
"deletionTimestamp": "2025-11-27T17:14:53Z",
"annotations": {
"grafana.app/updatedTimestamp": "2025-11-28 15:03:31.090159 +0000 UTC"
}
},
"spec": {
"description": "Use synchronized dispatch timer to minimize duplicate notifications across alertmanager HA pods",
"stage": "experimental",
"codeowner": "@grafana/alerting-squad",
"requiresRestart": true,
"hideFromDocs": true
}
},
{
"metadata": {
"name": "alertingTriage",
@@ -3646,4 +3664,4 @@
}
}
]
}
}

View File

@@ -222,6 +222,7 @@ func (ng *AlertNG) init() error {
ExternalURL: ng.Cfg.AppURL,
SmtpConfig: smtpCfg,
Timeout: ng.Cfg.UnifiedAlerting.RemoteAlertmanager.Timeout,
DispatchTimer: notifier.GetDispatchTimer(ng.FeatureToggles).String(),
}
autogenFn := func(ctx context.Context, logger log.Logger, orgID int64, cfg *definitions.PostableApiAlertingConfig, invalidReceiverAction notifier.InvalidReceiversAction) error {
return notifier.AddAutogenConfig(ctx, logger, ng.store, orgID, cfg, invalidReceiverAction, ng.FeatureToggles)

View File

@@ -32,6 +32,9 @@ const (
// How long we keep silences in the kvstore after they've expired.
silenceRetention = 5 * 24 * time.Hour
// How long we keep flushes in the kvstore after they've expired.
flushRetention = 5 * 24 * time.Hour
)
type AlertingStore interface {
@@ -43,8 +46,10 @@ type AlertingStore interface {
type stateStore interface {
SaveSilences(ctx context.Context, st alertingNotify.State) (int64, error)
SaveNotificationLog(ctx context.Context, st alertingNotify.State) (int64, error)
SaveFlushLog(ctx context.Context, st alertingNotify.State) (int64, error)
GetSilences(ctx context.Context) (string, error)
GetNotificationLog(ctx context.Context) (string, error)
GetFlushLog(ctx context.Context) (string, error)
}
type alertmanager struct {
@@ -99,6 +104,10 @@ func NewAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store A
if err != nil {
return nil, err
}
flushLog, err := stateStore.GetFlushLog(ctx)
if err != nil {
return nil, err
}
silencesOptions := maintenanceOptions{
initialState: silences,
@@ -121,12 +130,29 @@ func NewAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store A
}
l := log.New("ngalert.notifier")
dispatchTimer := GetDispatchTimer(featureToggles)
var flushLogOptions maintenanceOptions
if dispatchTimer == alertingNotify.DispatchTimerSync {
flushLogOptions = maintenanceOptions{
initialState: flushLog,
retention: flushRetention,
maintenanceFrequency: maintenanceInterval,
maintenanceFunc: func(state alertingNotify.State) (int64, error) {
// Detached context here is to make sure that when the service is shut down the persist operation is executed.
return stateStore.SaveFlushLog(context.Background(), state)
},
}
}
opts := alertingNotify.GrafanaAlertmanagerOpts{
ExternalURL: cfg.AppURL,
AlertStoreCallback: nil,
PeerTimeout: cfg.UnifiedAlerting.HAPeerTimeout,
Silences: silencesOptions,
Nflog: nflogOptions,
FlushLog: flushLogOptions,
DispatchTimer: dispatchTimer,
Limits: alertingNotify.Limits{
MaxSilences: cfg.UnifiedAlerting.AlertmanagerMaxSilencesCount,
MaxSilenceSizeBytes: cfg.UnifiedAlerting.AlertmanagerMaxSilenceSizeBytes,

View File

@@ -0,0 +1,19 @@
package notifier
import (
alertingNotify "github.com/grafana/alerting/notify"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/featuremgmt"
)
// GetDispatchTimer returns the appropriate dispatch timer based on feature toggles.
func GetDispatchTimer(features featuremgmt.FeatureToggles) (dt alertingNotify.DispatchTimer) {
//nolint:staticcheck // not yet migrated to OpenFeature
enabled := features.IsEnabledGlobally(featuremgmt.FlagAlertingSyncDispatchTimer)
log.New("ngalert.dispatchTimer").Info("GetDispatchTimer called", "enabled", enabled, "result", dt.String())
if enabled {
dt = alertingNotify.DispatchTimerSync
}
return
}

View File

@@ -15,6 +15,7 @@ const (
KVNamespace = "alertmanager"
NotificationLogFilename = "notifications"
SilencesFilename = "silences"
FlushLogFilename = "flushes"
)
// FileStore is in charge of persisting the alertmanager files to the database.
@@ -42,6 +43,10 @@ func (fileStore *FileStore) GetNotificationLog(ctx context.Context) (string, err
return fileStore.contentFor(ctx, NotificationLogFilename)
}
func (fileStore *FileStore) GetFlushLog(ctx context.Context) (string, error) {
return fileStore.contentFor(ctx, FlushLogFilename)
}
// contentFor returns the content for the given Alertmanager kvstore key.
func (fileStore *FileStore) contentFor(ctx context.Context, filename string) (string, error) {
// Then, let's attempt to read it from the database.
@@ -74,6 +79,11 @@ func (fileStore *FileStore) SaveNotificationLog(ctx context.Context, st alerting
return fileStore.persist(ctx, NotificationLogFilename, st)
}
// SaveFlushLog saves the flush log to the database and returns the size of the unencoded state.
func (fileStore *FileStore) SaveFlushLog(ctx context.Context, st alertingNotify.State) (int64, error) {
return fileStore.persist(ctx, FlushLogFilename, st)
}
// persist takes care of persisting the binary representation of internal state to the database as a base64 encoded string.
func (fileStore *FileStore) persist(ctx context.Context, filename string, st alertingNotify.State) (int64, error) {
var size int64

View File

@@ -5,5 +5,9 @@ func (am *alertmanager) MergeState(state ExternalState) error {
if err := am.Base.MergeNflog(state.Nflog); err != nil {
return err
}
return am.Base.MergeSilences(state.Silences)
if err := am.Base.MergeSilences(state.Silences); err != nil {
return err
}
return nil
// return am.Base.MergeFlushLog(state.FlushLog)
}

View File

@@ -84,6 +84,8 @@ type Alertmanager struct {
promoteConfig bool
externalURL string
dispatchTimer string
}
type AlertmanagerConfig struct {
@@ -109,6 +111,9 @@ type AlertmanagerConfig struct {
// Timeout for the HTTP client.
Timeout time.Duration
// DispatchTimer specifies which timer to use in the dispatcher
DispatchTimer string
}
func (cfg *AlertmanagerConfig) Validate() error {
@@ -201,6 +206,7 @@ func NewAlertmanager(ctx context.Context, cfg AlertmanagerConfig, store stateSto
externalURL: cfg.ExternalURL,
promoteConfig: cfg.PromoteConfig,
smtp: cfg.SmtpConfig,
dispatchTimer: cfg.DispatchTimer,
}
// Parse the default configuration once and remember its hash so we can compare it later.
@@ -343,10 +349,11 @@ func (am *Alertmanager) buildConfiguration(ctx context.Context, raw []byte, crea
AlertmanagerConfig: mergeResult.Config,
Templates: templates,
},
CreatedAt: createdAtEpoch,
Promoted: am.promoteConfig,
ExternalURL: am.externalURL,
SmtpConfig: am.smtp,
CreatedAt: createdAtEpoch,
Promoted: am.promoteConfig,
ExternalURL: am.externalURL,
SmtpConfig: am.smtp,
DispatchTimer: am.dispatchTimer,
}
cfgHash, err := calculateUserGrafanaConfigHash(payload)

View File

@@ -37,6 +37,7 @@ type UserGrafanaConfig struct {
Promoted bool `json:"promoted"`
ExternalURL string `json:"external_url"`
SmtpConfig SmtpConfig `json:"smtp_config"`
DispatchTimer string `json:"dispatch_timer"`
}
func (mc *Mimir) GetGrafanaAlertmanagerConfig(ctx context.Context) (*UserGrafanaConfig, error) {