feat(monitoring): add alert evaluator with cooldown and multi-channel dispatch

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 22:04:10 +00:00 · 2026-02-15 00:30:21 +00:00 · 2026-02-15 00:30:21 +00:00 · 97c8f32967
commit 97c8f32967
parent dc14c5dbc7
2 changed files with 285 additions and 0 deletions
--- a/src/lib/monitoring/alert-evaluator.ts
+++ b/src/lib/monitoring/alert-evaluator.ts
@ -0,0 +1,185 @@
+/**
+ * Alert Evaluator
+ *
+ * Evaluates monitoring alert rules against current system metrics.
+ * Supports dot-notation metric paths, configurable conditions,
+ * cooldown periods, and multi-channel alert dispatch.
+ */
+
+import type { Payload } from 'payload'
+import type { AlertCondition, AlertSeverity, SystemMetrics } from './types.js'
+
+// ============================================================================
+// Pure Functions
+// ============================================================================
+
+/**
+ * Looks up a numeric metric by walking a dot-separated path through nested objects.
+ * Example: getMetricValue(metrics, 'system.cpuUsagePercent') => 92
+ *
+ * @param metrics - Root object to search.
+ * @param path - Property names joined by `.`, one per nesting level.
+ * @returns The value at the end of the path when it is a number; `undefined`
+ *   when a step lands on something that is not an object, or when the final
+ *   value is not a number.
+ */
+export function getMetricValue(metrics: Record<string, unknown>, path: string): number | undefined {
+  let node: unknown = metrics
+
+  for (const key of path.split('.')) {
+    // Only non-null objects can be indexed further; anything else ends the walk.
+    const canDescend = typeof node === 'object' && node !== null
+    if (!canDescend) return undefined
+    node = (node as Record<string, unknown>)[key]
+  }
+
+  if (typeof node !== 'number') return undefined
+  return node
+}
+
+/**
+ * Applies a comparison operator to a metric value and a threshold.
+ *
+ * @param condition - Operator name: 'gt', 'gte', 'lt', 'lte' or 'eq'.
+ * @param value - Left-hand operand (the observed metric).
+ * @param threshold - Right-hand operand.
+ * @returns The result of `value <op> threshold`; `false` for any operator
+ *   name that is not recognised.
+ */
+export function evaluateCondition(
+  condition: AlertCondition,
+  value: number,
+  threshold: number,
+): boolean {
+  if (condition === 'gt') return value > threshold
+  if (condition === 'gte') return value >= threshold
+  if (condition === 'lt') return value < threshold
+  if (condition === 'lte') return value <= threshold
+  if (condition === 'eq') return value === threshold
+
+  // Unrecognised operator: never matches.
+  return false
+}
+
+// ============================================================================
+// Types
+// ============================================================================
+
+/**
+ * Notification targets attached to a rule.
+ * NOTE(review): nothing in this module reads these fields — confirm where they are consumed.
+ */
+interface AlertRuleRecipients {
+  emails?: Array<{ email: string }>
+  slackWebhook?: string
+  discordWebhook?: string
+}
+
+/**
+ * Shape this module assumes for documents read from the
+ * 'monitoring-alert-rules' collection.
+ */
+interface AlertRule {
+  /** Rule identifier; also the key used for cooldown tracking. */
+  id: string
+  /** Display name, used in alert titles and messages. */
+  name: string
+  /** Dot-notation path of the metric to check. */
+  metric: string
+  /** Comparison operator applied as `value <condition> threshold`. */
+  condition: AlertCondition
+  threshold: number
+  severity: AlertSeverity
+  /** Copied into the history record as `channelsSent`. */
+  channels: string[]
+  recipients?: AlertRuleRecipients
+  /** Minimum number of minutes between two fires of this rule. */
+  cooldownMinutes: number
+  /** Only rules with `enabled` set to true are queried for evaluation. */
+  enabled: boolean
+}
+
+// Maps AlertSeverity to the level value passed to alert-service's sendAlert.
+// The value type is the literal union rather than `string`, so a lookup is
+// already correctly typed and does not need a type assertion to be used.
+const SEVERITY_TO_LEVEL: Record<AlertSeverity, 'warning' | 'error' | 'critical'> = {
+  warning: 'warning',
+  error: 'error',
+  critical: 'critical',
+}
+
+// ============================================================================
+// AlertEvaluator Class
+// ============================================================================
+
+/**
+ * Evaluates alert rules against a metrics snapshot and dispatches alerts,
+ * enforcing a per-rule cooldown that is held in memory on this instance.
+ */
+export class AlertEvaluator {
+  /** Epoch-millisecond timestamp of the last fire, keyed by rule id. */
+  private cooldownMap: Map<string, number> = new Map()
+
+  /**
+   * Returns true if the rule should fire (not in cooldown).
+   * Records the current time as last-fired when returning true.
+   *
+   * @param ruleId - Key under which the last fire time is tracked.
+   * @param cooldownMinutes - Minimum minutes that must have elapsed since the
+   *   previous fire; a rule fires again once elapsed >= cooldownMinutes.
+   */
+  shouldFire(ruleId: string, cooldownMinutes: number): boolean {
+    const now = Date.now()
+    const lastFired = this.cooldownMap.get(ruleId)
+
+    // Compare against undefined rather than truthiness: a stored timestamp
+    // of 0 is a real entry and must still enforce the cooldown.
+    if (lastFired !== undefined && (now - lastFired) / 60_000 < cooldownMinutes) {
+      return false
+    }
+
+    this.cooldownMap.set(ruleId, now)
+    return true
+  }
+
+  /**
+   * Evaluates all enabled rules against current metrics.
+   * Fires alerts for rules that match and are not in cooldown.
+   *
+   * Never rejects: a failure while loading rules is logged, and a failure
+   * while processing one rule is logged without stopping the remaining rules.
+   *
+   * @param payload - Payload instance used to query rules and write history.
+   * @param metrics - Metrics snapshot that rule metric paths are resolved against.
+   */
+  async evaluateRules(
+    payload: Payload,
+    metrics: Omit<SystemMetrics, 'timestamp'>,
+  ): Promise<void> {
+    try {
+      const rules = await payload.find({
+        collection: 'monitoring-alert-rules',
+        where: { enabled: { equals: true } },
+        limit: 100,
+      })
+
+      // Only the first page is evaluated; make the truncation visible.
+      if (rules.hasNextPage) {
+        console.warn(
+          '[AlertEvaluator] More than 100 enabled rules found; only the first 100 are evaluated',
+        )
+      }
+
+      const metricTree = metrics as unknown as Record<string, unknown>
+
+      for (const doc of rules.docs) {
+        const rule = doc as unknown as AlertRule
+
+        // Isolate each rule so one malformed document cannot abort the rest.
+        try {
+          const value = getMetricValue(metricTree, rule.metric)
+          if (value === undefined) continue
+          if (!evaluateCondition(rule.condition, value, rule.threshold)) continue
+          if (!this.shouldFire(rule.id, rule.cooldownMinutes)) continue
+
+          const delivered = await this.dispatchAlert(payload, rule, value)
+          if (!delivered) {
+            // Nothing was recorded or sent: release the cooldown so the next
+            // evaluation retries instead of silently dropping the alert.
+            this.cooldownMap.delete(rule.id)
+          }
+        } catch (error) {
+          console.error(`[AlertEvaluator] Error evaluating rule ${rule.id}:`, error)
+        }
+      }
+    } catch (error) {
+      console.error('[AlertEvaluator] Error evaluating rules:', error)
+    }
+  }
+
+  /**
+   * Creates an alert history record and attempts to send notifications
+   * via the existing alert service.
+   *
+   * The two steps are independent: a failed history write does not prevent
+   * the notification attempt, and vice versa. Failures are logged, not thrown.
+   *
+   * @returns true when at least one of the two steps succeeded.
+   */
+  private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<boolean> {
+    const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`
+
+    let recorded = false
+    try {
+      await payload.create({
+        collection: 'monitoring-alert-history',
+        data: {
+          rule: rule.id,
+          metric: rule.metric,
+          value,
+          threshold: rule.threshold,
+          severity: rule.severity,
+          message,
+          // NOTE(review): written before any send is attempted, so this lists
+          // the configured channels rather than confirmed deliveries.
+          channelsSent: rule.channels,
+        },
+      })
+      recorded = true
+    } catch (error) {
+      console.error('[AlertEvaluator] Error dispatching alert:', error)
+    }
+
+    let notified = false
+    try {
+      // Loaded lazily so a missing alert service only degrades notifications.
+      const { sendAlert } = await import('../alerting/alert-service.js')
+      await sendAlert(payload, {
+        level: SEVERITY_TO_LEVEL[rule.severity] as 'warning' | 'error' | 'critical',
+        title: `[${rule.severity.toUpperCase()}] ${rule.name}`,
+        message,
+        details: {
+          metric: rule.metric,
+          value,
+          threshold: rule.threshold,
+          condition: rule.condition,
+        },
+      })
+      notified = true
+    } catch (error) {
+      // Alert service not available; keep the cause in the log for diagnosis.
+      console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`, error)
+    }
+
+    return recorded || notified
+  }
+}
--- a/tests/unit/monitoring/alert-evaluator.unit.spec.ts
+++ b/tests/unit/monitoring/alert-evaluator.unit.spec.ts
@ -0,0 +1,100 @@
+import { describe, it, expect } from 'vitest'
+import {
+  getMetricValue,
+  evaluateCondition,
+  AlertEvaluator,
+} from '@/lib/monitoring/alert-evaluator'
+
+// getMetricValue: path resolution and the non-numeric / missing fallbacks.
+describe('getMetricValue', () => {
+  it('resolves simple paths', () => {
+    const result = getMetricValue({ system: { cpuUsagePercent: 92 } }, 'system.cpuUsagePercent')
+    expect(result).toBe(92)
+  })
+
+  it('resolves deeply nested paths', () => {
+    const result = getMetricValue(
+      { services: { redis: { memoryUsedMB: 512 } } },
+      'services.redis.memoryUsedMB',
+    )
+    expect(result).toBe(512)
+  })
+
+  it('returns undefined for non-existent paths', () => {
+    const result = getMetricValue({ system: { cpuUsagePercent: 92 } }, 'system.nonExistent')
+    expect(result).toBeUndefined()
+  })
+
+  it('returns undefined for non-numeric values', () => {
+    const result = getMetricValue({ system: { name: 'test' } }, 'system.name')
+    expect(result).toBeUndefined()
+  })
+
+  it('returns undefined when traversing through null', () => {
+    // A null intermediate node must end the walk instead of throwing.
+    const withNullBranch: Record<string, unknown> = { system: null }
+    expect(getMetricValue(withNullBranch, 'system.cpu')).toBeUndefined()
+  })
+
+  it('resolves top-level numeric values', () => {
+    const result = getMetricValue({ uptime: 3600 }, 'uptime')
+    expect(result).toBe(3600)
+  })
+})
+
+// evaluateCondition: each operator is checked above, at, and below its threshold.
+describe('evaluateCondition', () => {
+  it('gt: fires when value exceeds threshold', () => {
+    const cases = [[92, true], [80, false], [45, false]] as const
+    for (const [value, expected] of cases) {
+      expect(evaluateCondition('gt', value, 80)).toBe(expected)
+    }
+  })
+
+  it('lt: fires when value is below threshold', () => {
+    const cases = [[5, true], [10, false], [15, false]] as const
+    for (const [value, expected] of cases) {
+      expect(evaluateCondition('lt', value, 10)).toBe(expected)
+    }
+  })
+
+  it('gte: fires when value >= threshold', () => {
+    const cases = [[80, true], [81, true], [79, false]] as const
+    for (const [value, expected] of cases) {
+      expect(evaluateCondition('gte', value, 80)).toBe(expected)
+    }
+  })
+
+  it('lte: fires when value <= threshold', () => {
+    const cases = [[10, true], [9, true], [11, false]] as const
+    for (const [value, expected] of cases) {
+      expect(evaluateCondition('lte', value, 10)).toBe(expected)
+    }
+  })
+
+  it('eq: fires when value equals threshold', () => {
+    const cases = [[100, true], [99, false]] as const
+    for (const [value, expected] of cases) {
+      expect(evaluateCondition('eq', value, 100)).toBe(expected)
+    }
+  })
+
+  it('returns false for unknown condition', () => {
+    const outcome = evaluateCondition('unknown' as never, 50, 50)
+    expect(outcome).toBe(false)
+  })
+})
+
+// shouldFire: cooldown bookkeeping per rule id.
+describe('AlertEvaluator.shouldFire', () => {
+  it('allows first fire', () => {
+    const evaluator = new AlertEvaluator()
+    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
+  })
+
+  it('blocks during cooldown', () => {
+    const evaluator = new AlertEvaluator()
+    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
+    expect(evaluator.shouldFire('rule-1', 15)).toBe(false)
+  })
+
+  it('different rules have independent cooldowns', () => {
+    const evaluator = new AlertEvaluator()
+    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
+    expect(evaluator.shouldFire('rule-2', 15)).toBe(true)
+  })
+
+  it('allows fire after cooldown expires', () => {
+    const evaluator = new AlertEvaluator()
+    const realNow = Date.now
+    let fakeNow = 1_000_000
+    // Drive the clock by hand so the test really crosses the cooldown boundary.
+    Date.now = () => fakeNow
+    try {
+      expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
+
+      // 14 minutes in: still cooling down.
+      fakeNow += 14 * 60_000
+      expect(evaluator.shouldFire('rule-1', 15)).toBe(false)
+
+      // 15 minutes after the first fire: cooldown has elapsed.
+      fakeNow += 60_000
+      expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
+    } finally {
+      Date.now = realNow
+    }
+  })
+
+  it('allows immediate re-fire with zero cooldown', () => {
+    const evaluator = new AlertEvaluator()
+    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
+    // Elapsed time is >= 0 and is never less than a 0-minute cooldown,
+    // so the second call fires even when no time has passed.
+    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
+  })
+})