/**
 * Alert Evaluator
 *
 * Evaluates monitoring alert rules against current system metrics.
 * Supports dot-notation metric paths, configurable conditions,
 * cooldown periods, and multi-channel alert dispatch.
 *
 * File: src/lib/monitoring/alert-evaluator.ts
 */

import type { Payload } from 'payload'
import type { AlertCondition, AlertSeverity, SystemMetrics } from './types.js'

// ============================================================================
// Pure Functions
// ============================================================================

/**
 * Resolves a dot-notation metric path against a metrics object.
 *
 * @param metrics - Arbitrarily nested metrics object.
 * @param path - Dot-separated property path, e.g. `'system.cpuUsagePercent'`.
 * @returns The value at `path` when it is a `number`; `undefined` when any
 *   segment is missing, when traversal hits a non-object (including `null`),
 *   or when the final value is not a number.
 *
 * @example
 * getMetricValue({ system: { cpuUsagePercent: 92 } }, 'system.cpuUsagePercent') // => 92
 */
export function getMetricValue(metrics: Record<string, unknown>, path: string): number | undefined {
  let current: unknown = metrics

  for (const part of path.split('.')) {
    // Only objects can be descended into; anything else means the path is unresolvable.
    if (current === null || current === undefined || typeof current !== 'object') {
      return undefined
    }
    current = (current as Record<string, unknown>)[part]
  }

  return typeof current === 'number' ? current : undefined
}

/**
 * Evaluates a condition against a value and threshold.
 *
 * @param condition - Comparison operator: `gt`, `lt`, `eq`, `gte` or `lte`.
 * @param value - Observed metric value (left-hand side of the comparison).
 * @param threshold - Configured threshold (right-hand side of the comparison).
 * @returns `true` when `value <condition> threshold` holds; `false` otherwise,
 *   including for any condition string that is not one of the five above.
 */
export function evaluateCondition(
  condition: AlertCondition,
  value: number,
  threshold: number,
): boolean {
  switch (condition) {
    case 'gt':
      return value > threshold
    case 'lt':
      return value < threshold
    case 'eq':
      return value === threshold
    case 'gte':
      return value >= threshold
    case 'lte':
      return value <= threshold
    default:
      // Rules are loaded from storage, so an unexpected operator must not fire.
      return false
  }
}

// ============================================================================
// Types
// ============================================================================

/** Shape this module reads from a `monitoring-alert-rules` document. */
interface AlertRule {
  id: string
  name: string
  metric: string
  condition: AlertCondition
  threshold: number
  severity: AlertSeverity
  channels: string[]
  recipients?: {
    emails?: Array<{ email: string }>
    slackWebhook?: string
    discordWebhook?: string
  }
  cooldownMinutes: number
  enabled: boolean
}

/** Alert level literals passed to alert-service's `sendAlert`. */
type AlertLevel = 'warning' | 'error' | 'critical'

// Maps AlertSeverity to the AlertLevel expected by alert-service.
// NOTE(review): the type arguments of this Record were missing in the reviewed
// text; `<AlertSeverity, AlertLevel>` is reconstructed from the keys and values
// below - confirm against ./types.js.
const SEVERITY_TO_LEVEL: Record<AlertSeverity, AlertLevel> = {
  warning: 'warning',
  error: 'error',
  critical: 'critical',
}

/** Maximum number of enabled rules fetched per evaluation pass. */
const MAX_RULES_PER_PASS = 100

// ============================================================================
// AlertEvaluator Class
// ============================================================================

export class AlertEvaluator {
  /** Tracks last fire time (epoch milliseconds) per rule id to enforce cooldown. */
  private cooldownMap: Map<string, number> = new Map()

  /**
   * Returns true if the rule should fire (not in cooldown).
   * Records the current time as last-fired when returning true.
   *
   * @param ruleId - Identifier used as the cooldown key.
   * @param cooldownMinutes - Minimum minutes between two fires of the same rule.
   *   A value of `0` never blocks.
   */
  shouldFire(ruleId: string, cooldownMinutes: number): boolean {
    const now = Date.now()
    const lastFired = this.cooldownMap.get(ruleId)

    // Compare against `undefined` explicitly: a recorded timestamp of 0 is a
    // valid "already fired" marker and must not be mistaken for "never fired".
    if (lastFired !== undefined) {
      const elapsedMinutes = (now - lastFired) / 60_000
      if (elapsedMinutes < cooldownMinutes) return false
    }

    this.cooldownMap.set(ruleId, now)
    return true
  }

  /**
   * Evaluates all enabled rules against current metrics.
   * Fires alerts for rules that match and are not in cooldown.
   *
   * Never rejects: a failure while loading the rules is logged and ends the
   * pass; a failure inside a single rule is logged and the remaining rules are
   * still evaluated.
   *
   * @param payload - Payload instance used to load rules and record alerts.
   * @param metrics - Current metrics, addressed by each rule's dot-notation path.
   *   NOTE(review): the reviewed text declared this as a bare `Omit` with its
   *   type arguments missing; `Partial<SystemMetrics>` is used because it
   *   accepts every `Omit<SystemMetrics, K>` value - restore the exact type if known.
   */
  async evaluateRules(payload: Payload, metrics: Partial<SystemMetrics>): Promise<void> {
    try {
      const rules = await payload.find({
        collection: 'monitoring-alert-rules',
        where: { enabled: { equals: true } },
        limit: MAX_RULES_PER_PASS,
      })

      // Surface truncation instead of silently ignoring rules past the first page.
      if (rules.hasNextPage) {
        console.warn(
          `[AlertEvaluator] More than ${MAX_RULES_PER_PASS} enabled rules; only the first ${MAX_RULES_PER_PASS} were evaluated`,
        )
      }

      const metricTree = metrics as unknown as Record<string, unknown>

      for (const doc of rules.docs) {
        try {
          await this.evaluateRule(payload, doc as unknown as AlertRule, metricTree)
        } catch (error) {
          // Isolate failures so one malformed rule cannot block the others.
          const ruleId = (doc as { id?: unknown } | null | undefined)?.id
          console.error(`[AlertEvaluator] Error evaluating rule "${String(ruleId)}":`, error)
        }
      }
    } catch (error) {
      console.error('[AlertEvaluator] Error evaluating rules:', error)
    }
  }

  /**
   * Evaluates one rule: resolves its metric, checks the condition and the
   * cooldown, and dispatches an alert when all three pass.
   */
  private async evaluateRule(
    payload: Payload,
    rule: AlertRule,
    metrics: Record<string, unknown>,
  ): Promise<void> {
    const value = getMetricValue(metrics, rule.metric)
    if (value === undefined) return
    if (!evaluateCondition(rule.condition, value, rule.threshold)) return
    if (!this.shouldFire(rule.id, rule.cooldownMinutes)) return

    await this.dispatchAlert(payload, rule, value)
  }

  /**
   * Creates an alert history record and attempts to send notifications
   * via the existing alert service.
   *
   * The two steps are independent: a failed history write is logged and does
   * not prevent the notification attempt, and a failed notification is logged
   * without affecting the history record. Neither failure is rethrown.
   */
  private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<void> {
    const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`

    try {
      await payload.create({
        collection: 'monitoring-alert-history',
        data: {
          rule: rule.id,
          metric: rule.metric,
          value,
          threshold: rule.threshold,
          severity: rule.severity,
          message,
          // NOTE(review): written before any send attempt, so this is the rule's
          // configured channels, not a confirmation of delivery.
          channelsSent: rule.channels,
        },
      })
    } catch (error) {
      console.error('[AlertEvaluator] Error recording alert history:', error)
    }

    // Try to send via existing alert service
    try {
      const { sendAlert } = await import('../alerting/alert-service.js')
      await sendAlert(payload, {
        level: SEVERITY_TO_LEVEL[rule.severity],
        title: `[${rule.severity.toUpperCase()}] ${rule.name}`,
        message,
        details: {
          metric: rule.metric,
          value,
          threshold: rule.threshold,
          condition: rule.condition,
        },
      })
    } catch (error) {
      // Best effort: the alert service may be unavailable. Keep the cause in the log.
      console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`, error)
    }
  }
}
/**
 * Unit tests for the alert evaluator: metric path resolution, condition
 * evaluation, and per-rule cooldown handling.
 *
 * File: tests/unit/monitoring/alert-evaluator.unit.spec.ts
 */
import { afterEach, describe, expect, it, vi } from 'vitest'
import {
  getMetricValue,
  evaluateCondition,
  AlertEvaluator,
} from '@/lib/monitoring/alert-evaluator'

describe('getMetricValue', () => {
  it('resolves simple paths', () => {
    const metrics = { system: { cpuUsagePercent: 92 } }
    expect(getMetricValue(metrics, 'system.cpuUsagePercent')).toBe(92)
  })

  it('resolves deeply nested paths', () => {
    const metrics = { services: { redis: { memoryUsedMB: 512 } } }
    expect(getMetricValue(metrics, 'services.redis.memoryUsedMB')).toBe(512)
  })

  it('returns undefined for non-existent paths', () => {
    const metrics = { system: { cpuUsagePercent: 92 } }
    expect(getMetricValue(metrics, 'system.nonExistent')).toBeUndefined()
  })

  it('returns undefined for non-numeric values', () => {
    const metrics = { system: { name: 'test' } }
    expect(getMetricValue(metrics, 'system.name')).toBeUndefined()
  })

  it('returns undefined when traversing through null', () => {
    const metrics = { system: null }
    expect(getMetricValue(metrics as Record<string, unknown>, 'system.cpu')).toBeUndefined()
  })

  it('resolves top-level numeric values', () => {
    const metrics = { uptime: 3600 }
    expect(getMetricValue(metrics, 'uptime')).toBe(3600)
  })
})

describe('evaluateCondition', () => {
  it('gt: fires when value exceeds threshold', () => {
    expect(evaluateCondition('gt', 92, 80)).toBe(true)
    expect(evaluateCondition('gt', 80, 80)).toBe(false)
    expect(evaluateCondition('gt', 45, 80)).toBe(false)
  })

  it('lt: fires when value is below threshold', () => {
    expect(evaluateCondition('lt', 5, 10)).toBe(true)
    expect(evaluateCondition('lt', 10, 10)).toBe(false)
    expect(evaluateCondition('lt', 15, 10)).toBe(false)
  })

  it('gte: fires when value >= threshold', () => {
    expect(evaluateCondition('gte', 80, 80)).toBe(true)
    expect(evaluateCondition('gte', 81, 80)).toBe(true)
    expect(evaluateCondition('gte', 79, 80)).toBe(false)
  })

  it('lte: fires when value <= threshold', () => {
    expect(evaluateCondition('lte', 10, 10)).toBe(true)
    expect(evaluateCondition('lte', 9, 10)).toBe(true)
    expect(evaluateCondition('lte', 11, 10)).toBe(false)
  })

  it('eq: fires when value equals threshold', () => {
    expect(evaluateCondition('eq', 100, 100)).toBe(true)
    expect(evaluateCondition('eq', 99, 100)).toBe(false)
  })

  it('returns false for unknown condition', () => {
    expect(evaluateCondition('unknown' as never, 50, 50)).toBe(false)
  })
})

describe('AlertEvaluator.shouldFire', () => {
  // Any test that installs fake timers must not leak them into the next one.
  afterEach(() => {
    vi.useRealTimers()
  })

  it('allows first fire', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
  })

  it('blocks during cooldown', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    expect(evaluator.shouldFire('rule-1', 15)).toBe(false)
  })

  it('different rules have independent cooldowns', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    expect(evaluator.shouldFire('rule-2', 15)).toBe(true)
  })

  it('allows immediate re-fire with a zero-minute cooldown', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
    // The rule is blocked only while elapsed < cooldown; elapsed is never
    // below 0, so a 0-minute cooldown cannot block even with no time passed.
    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
  })

  it('allows fire after cooldown expires', () => {
    vi.useFakeTimers()
    vi.setSystemTime(new Date('2024-01-01T00:00:00Z'))

    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)

    // 14 minutes in: still inside the 15-minute cooldown window.
    vi.advanceTimersByTime(14 * 60_000)
    expect(evaluator.shouldFire('rule-1', 15)).toBe(false)

    // 15 minutes in: elapsed is no longer less than the cooldown, so it fires.
    vi.advanceTimersByTime(60_000)
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)

    // Firing restarts the cooldown window.
    expect(evaluator.shouldFire('rule-1', 15)).toBe(false)
  })
})