feat(monitoring): add alert evaluator with cooldown and multi-channel dispatch

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Martin Porwoll 2026-02-15 00:30:21 +00:00
parent dc14c5dbc7
commit 97c8f32967
2 changed files with 285 additions and 0 deletions

View file

@ -0,0 +1,185 @@
/**
* Alert Evaluator
*
* Evaluates monitoring alert rules against current system metrics.
* Supports dot-notation metric paths, configurable conditions,
* cooldown periods, and multi-channel alert dispatch.
*/
import type { Payload } from 'payload'
import type { AlertCondition, AlertSeverity, SystemMetrics } from './types.js'
// ============================================================================
// Pure Functions
// ============================================================================
/**
 * Looks up a numeric metric by walking a dot-separated path through a
 * nested metrics object.
 *
 * Example: getMetricValue(metrics, 'system.cpuUsagePercent') => 92
 *
 * @param metrics - Root object to walk.
 * @param path - Property names joined by `.`, applied from left to right.
 * @returns The value found at `path` when it is of type `number`; `undefined`
 *   when a step along the path is not an object, or when the final value is
 *   not a number.
 */
export function getMetricValue(metrics: Record<string, unknown>, path: string): number | undefined {
  const resolved = path.split('.').reduce<unknown>(
    // Once a step lands on a non-object (including null/undefined), every
    // following step keeps yielding undefined.
    (node, key) =>
      typeof node === 'object' && node !== null
        ? (node as Record<string, unknown>)[key]
        : undefined,
    metrics,
  )

  if (typeof resolved !== 'number') {
    return undefined
  }
  return resolved
}
/** Comparison implementations, keyed by condition name. */
const CONDITION_COMPARATORS: ReadonlyMap<string, (value: number, threshold: number) => boolean> =
  new Map<string, (value: number, threshold: number) => boolean>([
    ['gt', (value, threshold) => value > threshold],
    ['lt', (value, threshold) => value < threshold],
    ['eq', (value, threshold) => value === threshold],
    ['gte', (value, threshold) => value >= threshold],
    ['lte', (value, threshold) => value <= threshold],
  ])

/**
 * Checks whether `value` satisfies `condition` relative to `threshold`.
 *
 * @param condition - Comparison to apply: `gt`, `lt`, `eq`, `gte` or `lte`.
 * @param value - Observed metric value (left-hand operand).
 * @param threshold - Configured limit (right-hand operand).
 * @returns The comparison result; `false` for any unrecognised condition.
 */
export function evaluateCondition(
  condition: AlertCondition,
  value: number,
  threshold: number,
): boolean {
  const compare = CONDITION_COMPARATORS.get(condition)
  return compare === undefined ? false : compare(value, threshold)
}
// ============================================================================
// Types
// ============================================================================
/** Optional notification targets attached to a rule. */
interface AlertRecipients {
  emails?: { email: string }[]
  slackWebhook?: string
  discordWebhook?: string
}

/**
 * Shape the evaluator assumes for documents read from the
 * `monitoring-alert-rules` collection.
 */
interface AlertRule {
  /** Rule identifier; used as the cooldown key and stored on history records. */
  id: string
  /** Display name, included in the alert title and message. */
  name: string
  /** Dot-notation path of the metric to check, e.g. `system.cpuUsagePercent`. */
  metric: string
  /** Comparison applied between the metric value and `threshold`. */
  condition: AlertCondition
  /** Limit the metric value is compared against. */
  threshold: number
  /** Severity stored on the history record and mapped to the alert level. */
  severity: AlertSeverity
  /** Channel names; written to the history record as `channelsSent`. */
  channels: string[]
  /** NOTE(review): not referenced by the evaluator code in this file — confirm who consumes it. */
  recipients?: AlertRecipients
  /** Minimum number of minutes between two fires of this rule. */
  cooldownMinutes: number
  /** Rules are queried with `enabled = true`, so evaluated rules have this set. */
  enabled: boolean
}
/**
 * Maps an AlertSeverity to the level passed to alert-service's `sendAlert`.
 *
 * Values are typed as the literal level union instead of `string`, so a lookup
 * result can be used wherever that union is expected without a type assertion.
 */
const SEVERITY_TO_LEVEL: Record<AlertSeverity, 'warning' | 'error' | 'critical'> = {
  warning: 'warning',
  error: 'error',
  critical: 'critical',
}
// ============================================================================
// AlertEvaluator Class
// ============================================================================
export class AlertEvaluator {
  /** Last fire time (Date.now() milliseconds) per rule id, used to enforce cooldowns. */
  private cooldownMap: Map<string, number> = new Map()

  /**
   * Decides whether a rule may fire now and, if so, starts its cooldown.
   *
   * @param ruleId - Key identifying the rule in the cooldown map.
   * @param cooldownMinutes - Minimum minutes between two fires of the same rule.
   * @returns `true` when the rule has not fired before or at least
   *   `cooldownMinutes` have elapsed since the last fire; the current time is
   *   then recorded as the new last-fired time. `false` while still in
   *   cooldown, in which case the recorded time is left untouched.
   */
  shouldFire(ruleId: string, cooldownMinutes: number): boolean {
    const now = Date.now()
    const lastFired = this.cooldownMap.get(ruleId)
    // Compare against undefined instead of relying on truthiness, so a
    // recorded timestamp of 0 (e.g. under a mocked clock) still counts.
    if (lastFired !== undefined) {
      const elapsedMinutes = (now - lastFired) / 60_000
      if (elapsedMinutes < cooldownMinutes) return false
    }
    this.cooldownMap.set(ruleId, now)
    return true
  }

  /**
   * Loads the enabled rules (at most 100) and evaluates each one against the
   * given metrics, dispatching an alert for every rule that matches and is
   * not in cooldown.
   *
   * Never rejects: a failure to load the rules is logged, and a failure while
   * evaluating one rule is logged without stopping the remaining rules.
   *
   * @param payload - Payload instance used to query rules and store history.
   * @param metrics - Current metrics; rule metric paths are resolved against it.
   */
  async evaluateRules(
    payload: Payload,
    metrics: Omit<SystemMetrics, 'timestamp'>,
  ): Promise<void> {
    try {
      const rules = await payload.find({
        collection: 'monitoring-alert-rules',
        where: { enabled: { equals: true } },
        limit: 100,
      })
      const metricTree = metrics as unknown as Record<string, unknown>
      for (const doc of rules.docs) {
        const rule = doc as unknown as AlertRule
        try {
          await this.evaluateRule(payload, rule, metricTree)
        } catch (error) {
          // Isolate failures (e.g. a rule document with a malformed metric
          // path) so one bad rule cannot block evaluation of the others.
          console.error(`[AlertEvaluator] Error evaluating rule ${rule.id}:`, error)
        }
      }
    } catch (error) {
      console.error('[AlertEvaluator] Error evaluating rules:', error)
    }
  }

  /**
   * Evaluates a single rule and dispatches an alert when it matches and is
   * not in cooldown. If the alert could not be recorded, the cooldown state
   * is restored so the rule is retried on the next evaluation.
   */
  private async evaluateRule(
    payload: Payload,
    rule: AlertRule,
    metrics: Record<string, unknown>,
  ): Promise<void> {
    const value = getMetricValue(metrics, rule.metric)
    if (value === undefined) return
    if (!evaluateCondition(rule.condition, value, rule.threshold)) return

    // Remember the pre-fire state: shouldFire() overwrites it when it returns true.
    const previousFire = this.cooldownMap.get(rule.id)
    if (!this.shouldFire(rule.id, rule.cooldownMinutes)) return

    const recorded = await this.dispatchAlert(payload, rule, value)
    if (!recorded) {
      if (previousFire === undefined) {
        this.cooldownMap.delete(rule.id)
      } else {
        this.cooldownMap.set(rule.id, previousFire)
      }
    }
  }

  /**
   * Creates an alert history record and then attempts to send a notification
   * through the alert service.
   *
   * @returns `true` when the history record was created (a notification
   *   failure is only logged as a warning); `false` when creating the
   *   history record failed.
   */
  private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<boolean> {
    const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`

    try {
      await payload.create({
        collection: 'monitoring-alert-history',
        data: {
          rule: rule.id,
          metric: rule.metric,
          value,
          threshold: rule.threshold,
          severity: rule.severity,
          message,
          // NOTE(review): this stores the rule's configured channels before
          // anything is sent, and sendAlert below is not given the rule's
          // channels or recipients — confirm this matches what is delivered.
          channelsSent: rule.channels,
        },
      })
    } catch (error) {
      console.error('[AlertEvaluator] Error dispatching alert:', error)
      return false
    }

    // Best effort: the history record already exists, so a notification
    // failure is logged (with its cause) and not treated as a dispatch failure.
    try {
      const { sendAlert } = await import('../alerting/alert-service.js')
      await sendAlert(payload, {
        level: SEVERITY_TO_LEVEL[rule.severity] as 'warning' | 'error' | 'critical',
        title: `[${rule.severity.toUpperCase()}] ${rule.name}`,
        message,
        details: {
          metric: rule.metric,
          value,
          threshold: rule.threshold,
          condition: rule.condition,
        },
      })
    } catch (error) {
      console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`, error)
    }
    return true
  }
}

View file

@ -0,0 +1,100 @@
import { describe, it, expect, vi } from 'vitest'
import {
getMetricValue,
evaluateCondition,
AlertEvaluator,
} from '@/lib/monitoring/alert-evaluator'
// Path resolution: hits, misses, non-numeric leaves and null intermediates.
describe('getMetricValue', () => {
  it('resolves simple paths', () => {
    const resolved = getMetricValue({ system: { cpuUsagePercent: 92 } }, 'system.cpuUsagePercent')
    expect(resolved).toBe(92)
  })

  it('resolves deeply nested paths', () => {
    const resolved = getMetricValue(
      { services: { redis: { memoryUsedMB: 512 } } },
      'services.redis.memoryUsedMB',
    )
    expect(resolved).toBe(512)
  })

  it('returns undefined for non-existent paths', () => {
    const resolved = getMetricValue({ system: { cpuUsagePercent: 92 } }, 'system.nonExistent')
    expect(resolved).toBeUndefined()
  })

  it('returns undefined for non-numeric values', () => {
    const resolved = getMetricValue({ system: { name: 'test' } }, 'system.name')
    expect(resolved).toBeUndefined()
  })

  it('returns undefined when traversing through null', () => {
    const withNullBranch: Record<string, unknown> = { system: null }
    expect(getMetricValue(withNullBranch, 'system.cpu')).toBeUndefined()
  })

  it('resolves top-level numeric values', () => {
    const resolved = getMetricValue({ uptime: 3600 }, 'uptime')
    expect(resolved).toBe(3600)
  })
})
// Each operator is checked above, at, and below its threshold where relevant.
describe('evaluateCondition', () => {
  it('gt: fires when value exceeds threshold', () => {
    const outcomes = [92, 80, 45].map((value) => evaluateCondition('gt', value, 80))
    expect(outcomes).toStrictEqual([true, false, false])
  })

  it('lt: fires when value is below threshold', () => {
    const outcomes = [5, 10, 15].map((value) => evaluateCondition('lt', value, 10))
    expect(outcomes).toStrictEqual([true, false, false])
  })

  it('gte: fires when value >= threshold', () => {
    const outcomes = [80, 81, 79].map((value) => evaluateCondition('gte', value, 80))
    expect(outcomes).toStrictEqual([true, true, false])
  })

  it('lte: fires when value <= threshold', () => {
    const outcomes = [10, 9, 11].map((value) => evaluateCondition('lte', value, 10))
    expect(outcomes).toStrictEqual([true, true, false])
  })

  it('eq: fires when value equals threshold', () => {
    const outcomes = [100, 99].map((value) => evaluateCondition('eq', value, 100))
    expect(outcomes).toStrictEqual([true, false])
  })

  it('returns false for unknown condition', () => {
    const outcome = evaluateCondition('unknown' as never, 50, 50)
    expect(outcome).toBe(false)
  })
})
// Cooldown behaviour of AlertEvaluator.shouldFire.
describe('AlertEvaluator.shouldFire', () => {
  it('allows first fire', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
  })

  it('blocks during cooldown', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    expect(evaluator.shouldFire('rule-1', 15)).toBe(false)
  })

  it('different rules have independent cooldowns', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    expect(evaluator.shouldFire('rule-2', 15)).toBe(true)
  })

  it('allows fire after cooldown expires', () => {
    // Drive the clock explicitly so the test really crosses the cooldown
    // boundary instead of relying on a zero-length cooldown.
    const start = Date.UTC(2026, 0, 1)
    const cooldownMs = 15 * 60_000
    vi.useFakeTimers()
    try {
      vi.setSystemTime(start)
      const evaluator = new AlertEvaluator()
      expect(evaluator.shouldFire('rule-1', 15)).toBe(true)

      // One millisecond before the cooldown ends: still blocked.
      vi.setSystemTime(start + cooldownMs - 1)
      expect(evaluator.shouldFire('rule-1', 15)).toBe(false)

      // Exactly at the cooldown boundary: allowed again.
      vi.setSystemTime(start + cooldownMs)
      expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    } finally {
      vi.useRealTimers()
    }
  })

  it('allows immediate re-fire with a zero-minute cooldown', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
    // Elapsed time is never below 0, so a 0-minute cooldown never blocks,
    // even when both calls happen within the same millisecond.
    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
  })
})