feat(monitoring): add alert evaluator with cooldown and multi-channel dispatch

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 00:24:10 +00:00 · 2026-02-15 00:30:21 +00:00 · 2026-02-15 00:30:21 +00:00 · 97c8f32967
commit 97c8f32967
parent dc14c5dbc7
2 changed files with 285 additions and 0 deletions
--- a/src/lib/monitoring/alert-evaluator.ts
+++ b/src/lib/monitoring/alert-evaluator.ts
@ -0,0 +1,185 @@
 /**
 * Alert Evaluator
 *
 * Evaluates monitoring alert rules against current system metrics.
 * Supports dot-notation metric paths, configurable conditions,
 * cooldown periods, and multi-channel alert dispatch.
 */
 import type { Payload } from 'payload'
 import type { AlertCondition, AlertSeverity, SystemMetrics } from './types.js'
 // ============================================================================
 // Pure Functions
 // ============================================================================
 /**
 * Looks up a numeric metric by walking `metrics` along a dot-separated path.
 *
 * Each path segment is used as a property key on the value reached so far.
 * The walk stops with `undefined` as soon as a segment has to be read from
 * something that is not a non-null object.
 *
 * @param metrics - Root object to start the lookup from.
 * @param path - Dot-separated property path, e.g. `'system.cpuUsagePercent'`.
 * @returns The value at `path` when it is of type `number`, otherwise `undefined`.
 *
 * @example
 * getMetricValue({ system: { cpuUsagePercent: 92 } }, 'system.cpuUsagePercent') // 92
 */
 export function getMetricValue(metrics: Record<string, unknown>, path: string): number | undefined {
  let node: unknown = metrics
  for (const key of path.split('.')) {
    // Only non-null objects can be descended into
    const traversable = typeof node === 'object' && node !== null
    if (!traversable) return undefined
    node = (node as Record<string, unknown>)[key]
  }
  if (typeof node !== 'number') return undefined
  return node
 }
 /**
 * Compares `value` with `threshold` using the operator named by `condition`.
 *
 * @param condition - One of `'gt'`, `'gte'`, `'lt'`, `'lte'`, `'eq'`.
 * @param value - Left-hand side of the comparison (the observed metric).
 * @param threshold - Right-hand side of the comparison.
 * @returns The comparison result; `false` when `condition` is none of the
 * recognised operators.
 */
 export function evaluateCondition(
  condition: AlertCondition,
  value: number,
  threshold: number,
 ): boolean {
  if (condition === 'gt') return value > threshold
  if (condition === 'gte') return value >= threshold
  if (condition === 'lt') return value < threshold
  if (condition === 'lte') return value <= threshold
  if (condition === 'eq') return value === threshold
  // Unrecognised operator: never matches
  return false
 }
 // ============================================================================
 // Types
 // ============================================================================
 /**
 * Shape this module expects for documents read from the
 * 'monitoring-alert-rules' collection. Query results are cast to this type
 * without runtime validation.
 */
 interface AlertRule {
  /** Rule identifier; used as the cooldown key and stored as `rule` on history records */
  id: string
  /** Display name; appears in the alert message and title */
  name: string
  /** Dot-notation path resolved against the metrics object via getMetricValue */
  metric: string
  /** Comparison operator applied between the metric value and `threshold` */
  condition: AlertCondition
  /** Value the metric is compared against */
  threshold: number
  /** Severity stored on the history record and mapped to the alert-service level */
  severity: AlertSeverity
  /** Stored as `channelsSent` on the history record */
  channels: string[]
  // NOTE(review): `recipients` is not read anywhere in this file; presumably consumed elsewhere — confirm
  recipients?: {
    emails?: Array<{ email: string }>
    slackWebhook?: string
    discordWebhook?: string
  }
  /** Minimum number of minutes between two firings of this rule */
  cooldownMinutes: number
  /** Only rules with `enabled` equal to true are fetched for evaluation */
  enabled: boolean
 }
 /**
 * Maps each AlertSeverity to the level string passed to the alert service.
 *
 * The value type is the literal union the alert dispatch passes as `level`,
 * rather than plain `string`, so the compiler checks every entry.
 */
 const SEVERITY_TO_LEVEL: Record<AlertSeverity, 'warning' | 'error' | 'critical'> = {
  warning: 'warning',
  error: 'error',
  critical: 'critical',
 }
 // ============================================================================
 // AlertEvaluator Class
 // ============================================================================
 /**
 * Evaluates alert rules against a metrics snapshot and dispatches alerts for
 * the rules that match, enforcing a per-rule cooldown kept in memory on the
 * instance.
 */
 export class AlertEvaluator {
  /** Last fire time (epoch milliseconds from Date.now()) per rule id, used to enforce cooldown */
  private readonly cooldownMap: Map<string, number> = new Map()
  /**
   * Returns true if the rule should fire (not in cooldown).
   * Records the current time as last-fired when returning true.
   *
   * @param ruleId - Key under which the last fire time is tracked.
   * @param cooldownMinutes - Minimum minutes that must have elapsed since the
   * last fire; a rule fires again once elapsed time is no longer below this.
   * @returns Whether the caller should fire the rule now.
   */
  shouldFire(ruleId: string, cooldownMinutes: number): boolean {
    // Read the clock once so the comparison and the stored value agree
    const now = Date.now()
    const lastFired = this.cooldownMap.get(ruleId)
    // Compare against undefined, not truthiness: a stored timestamp of 0 is still a real entry
    if (lastFired !== undefined) {
      const elapsedMinutes = (now - lastFired) / 60_000
      if (elapsedMinutes < cooldownMinutes) return false
    }
    this.cooldownMap.set(ruleId, now)
    return true
  }
  /**
   * Evaluates all enabled rules against current metrics.
   * Fires alerts for rules that match and are not in cooldown.
   *
   * Fetches up to 100 enabled rules. Errors are logged rather than thrown:
   * a failure while processing one rule is logged and the remaining rules are
   * still evaluated; a failure fetching the rules is logged and ends the run.
   *
   * @param payload - Payload instance used to query rules and record alerts.
   * @param metrics - Metrics snapshot that rule metric paths are resolved against.
   */
  async evaluateRules(
    payload: Payload,
    metrics: Omit<SystemMetrics, 'timestamp'>,
  ): Promise<void> {
    try {
      const rules = await payload.find({
        collection: 'monitoring-alert-rules',
        where: { enabled: { equals: true } },
        limit: 100,
      })
      for (const doc of rules.docs) {
        const rule = doc as unknown as AlertRule
        // Isolate each rule so one malformed document cannot abort the whole pass
        try {
          const value = getMetricValue(
            metrics as unknown as Record<string, unknown>,
            rule.metric,
          )
          if (value === undefined) continue
          if (!evaluateCondition(rule.condition, value, rule.threshold)) continue
          if (!this.shouldFire(rule.id, rule.cooldownMinutes)) continue
          await this.dispatchAlert(payload, rule, value)
        } catch (error) {
          console.error(`[AlertEvaluator] Error evaluating rule ${rule.id}:`, error)
        }
      }
    } catch (error) {
      console.error('[AlertEvaluator] Error evaluating rules:', error)
    }
  }
  /**
   * Creates an alert history record and attempts to send notifications
   * via the existing alert service.
   *
   * The two steps are attempted independently: a failed history write is
   * logged and the notification is still attempted, so a storage problem does
   * not silence the alert. Neither failure is rethrown.
   *
   * @param payload - Payload instance used to create the history record.
   * @param rule - The rule that matched.
   * @param value - The metric value that triggered the rule.
   */
  private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<void> {
    const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`
    try {
      await payload.create({
        collection: 'monitoring-alert-history',
        data: {
          rule: rule.id,
          metric: rule.metric,
          value,
          threshold: rule.threshold,
          severity: rule.severity,
          message,
          // NOTE(review): records the rule's configured channels before any send is attempted — confirm intended
          channelsSent: rule.channels,
        },
      })
    } catch (error) {
      console.error('[AlertEvaluator] Error recording alert history:', error)
    }
    // Try to send via existing alert service
    try {
      const { sendAlert } = await import('../alerting/alert-service.js')
      await sendAlert(payload, {
        level: SEVERITY_TO_LEVEL[rule.severity] as 'warning' | 'error' | 'critical',
        title: `[${rule.severity.toUpperCase()}] ${rule.name}`,
        message,
        details: {
          metric: rule.metric,
          value,
          threshold: rule.threshold,
          condition: rule.condition,
        },
      })
    } catch (error) {
      // Alert service unavailable or send failed; keep the cause in the log for diagnosis
      console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`, error)
    }
  }
 }
--- a/tests/unit/monitoring/alert-evaluator.unit.spec.ts
+++ b/tests/unit/monitoring/alert-evaluator.unit.spec.ts
@ -0,0 +1,100 @@
 import { describe, it, expect } from 'vitest'
 import {
  getMetricValue,
  evaluateCondition,
  AlertEvaluator,
 } from '@/lib/monitoring/alert-evaluator'
 // Unit tests for getMetricValue: dot-path lookup that yields a number or undefined.
 describe('getMetricValue', () => {
  it('resolves simple paths', () => {
    const metrics = { system: { cpuUsagePercent: 92 } }
    expect(getMetricValue(metrics, 'system.cpuUsagePercent')).toBe(92)
  })
  it('resolves deeply nested paths', () => {
    const metrics = { services: { redis: { memoryUsedMB: 512 } } }
    expect(getMetricValue(metrics, 'services.redis.memoryUsedMB')).toBe(512)
  })
  it('returns undefined for non-existent paths', () => {
    const metrics = { system: { cpuUsagePercent: 92 } }
    expect(getMetricValue(metrics, 'system.nonExistent')).toBeUndefined()
  })
  // A value that exists but is not a number is treated the same as a missing one
  it('returns undefined for non-numeric values', () => {
    const metrics = { system: { name: 'test' } }
    expect(getMetricValue(metrics, 'system.name')).toBeUndefined()
  })
  // A null intermediate must stop the walk instead of throwing on property access
  it('returns undefined when traversing through null', () => {
    const metrics = { system: null }
    expect(getMetricValue(metrics as Record<string, unknown>, 'system.cpu')).toBeUndefined()
  })
  // A path with no dots addresses a property of the root object directly
  it('resolves top-level numeric values', () => {
    const metrics = { uptime: 3600 }
    expect(getMetricValue(metrics, 'uptime')).toBe(3600)
  })
 })
 // Unit tests for evaluateCondition: each operator is checked above, at, and/or below the threshold.
 describe('evaluateCondition', () => {
  it('gt: fires when value exceeds threshold', () => {
    expect(evaluateCondition('gt', 92, 80)).toBe(true)
    expect(evaluateCondition('gt', 80, 80)).toBe(false)
    expect(evaluateCondition('gt', 45, 80)).toBe(false)
  })
  it('lt: fires when value is below threshold', () => {
    expect(evaluateCondition('lt', 5, 10)).toBe(true)
    expect(evaluateCondition('lt', 10, 10)).toBe(false)
    expect(evaluateCondition('lt', 15, 10)).toBe(false)
  })
  it('gte: fires when value >= threshold', () => {
    expect(evaluateCondition('gte', 80, 80)).toBe(true)
    expect(evaluateCondition('gte', 81, 80)).toBe(true)
    expect(evaluateCondition('gte', 79, 80)).toBe(false)
  })
  it('lte: fires when value <= threshold', () => {
    expect(evaluateCondition('lte', 10, 10)).toBe(true)
    expect(evaluateCondition('lte', 9, 10)).toBe(true)
    expect(evaluateCondition('lte', 11, 10)).toBe(false)
  })
  it('eq: fires when value equals threshold', () => {
    expect(evaluateCondition('eq', 100, 100)).toBe(true)
    expect(evaluateCondition('eq', 99, 100)).toBe(false)
  })
  // The `as never` cast forces a value outside the AlertCondition type to exercise the fallback branch
  it('returns false for unknown condition', () => {
    expect(evaluateCondition('unknown' as never, 50, 50)).toBe(false)
  })
 })
 // Unit tests for AlertEvaluator.shouldFire. Each test builds its own evaluator,
 // so cooldown state never carries over between tests.
 describe('AlertEvaluator.shouldFire', () => {
  it('allows first fire', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
  })
  // The first call records the fire time; the immediate second call falls inside the 15-minute window
  it('blocks during cooldown', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    expect(evaluator.shouldFire('rule-1', 15)).toBe(false)
  })
  it('different rules have independent cooldowns', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    expect(evaluator.shouldFire('rule-2', 15)).toBe(true)
  })
  it('allows fire after cooldown expires', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
    // With a 0-minute cooldown an immediate re-fire is allowed: elapsed time is
    // 0 or more, and the cooldown only blocks while elapsed < cooldown, which
    // can never hold when the cooldown is 0.
    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
  })
 })