mirror of
https://github.com/complexcaresolutions/cms.c2sgmbh.git
synced 2026-03-18 00:24:10 +00:00
feat(monitoring): add alert evaluator with cooldown and multi-channel dispatch
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
dc14c5dbc7
commit
97c8f32967
2 changed files with 285 additions and 0 deletions
185
src/lib/monitoring/alert-evaluator.ts
Normal file
185
src/lib/monitoring/alert-evaluator.ts
Normal file
|
|
@ -0,0 +1,185 @@
|
||||||
|
/**
|
||||||
|
* Alert Evaluator
|
||||||
|
*
|
||||||
|
* Evaluates monitoring alert rules against current system metrics.
|
||||||
|
* Supports dot-notation metric paths, configurable conditions,
|
||||||
|
* cooldown periods, and multi-channel alert dispatch.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { Payload } from 'payload'
|
||||||
|
import type { AlertCondition, AlertSeverity, SystemMetrics } from './types.js'
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Pure Functions
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/**
 * Resolves a dot-notation metric path against a metrics object.
 *
 * The path is split on `.` and the object tree is walked one segment at a
 * time. The walk stops early as soon as an intermediate value is not an
 * object.
 *
 * @example
 * getMetricValue({ system: { cpuUsagePercent: 92 } }, 'system.cpuUsagePercent') // => 92
 *
 * @param metrics - Object tree to read from.
 * @param path - Dot-separated property path, e.g. `services.redis.memoryUsedMB`.
 * @returns The number found at `path`, or `undefined` when a segment is
 *   missing, an intermediate value is `null`/a primitive, or the final value
 *   is not a usable number (non-numeric or `NaN`).
 */
export function getMetricValue(metrics: Record<string, unknown>, path: string): number | undefined {
  let current: unknown = metrics

  for (const part of path.split('.')) {
    // `typeof undefined` is 'undefined', so this single check also covers missing segments.
    if (current === null || typeof current !== 'object') {
      return undefined
    }
    current = (current as Record<string, unknown>)[part]
  }

  // NaN has typeof 'number' but can never satisfy a comparison; report it as "no value".
  return typeof current === 'number' && !Number.isNaN(current) ? current : undefined
}
|
||||||
|
|
||||||
|
/**
 * Evaluates a comparison between a metric value and a threshold.
 *
 * @param condition - Comparison operator: `gt`, `lt`, `eq`, `gte` or `lte`.
 * @param value - Current metric value (left-hand side of the comparison).
 * @param threshold - Threshold to compare against (right-hand side).
 * @returns `true` when the comparison holds; `false` otherwise, including
 *   for any condition that is not one of the five recognised operators.
 */
export function evaluateCondition(
  condition: AlertCondition,
  value: number,
  threshold: number,
): boolean {
  switch (condition) {
    case 'gt':
      return value > threshold
    case 'gte':
      return value >= threshold
    case 'lt':
      return value < threshold
    case 'lte':
      return value <= threshold
    case 'eq':
      // Strict equality: no tolerance is applied to floating-point values.
      return value === threshold
    default:
      // Unrecognised operators never match.
      return false
  }
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Types
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/**
 * Shape of a document from the `monitoring-alert-rules` collection as it is
 * consumed by the evaluator in this file.
 */
interface AlertRule {
  /** Rule identifier; used as the cooldown key and stored on history records. */
  id: string
  /** Human-readable rule name, included in alert titles and messages. */
  name: string
  /** Dot-notation path of the metric to check, e.g. `system.cpuUsagePercent`. */
  metric: string
  /** Comparison operator applied as `value <condition> threshold`. */
  condition: AlertCondition
  /** Value the metric is compared against. */
  threshold: number
  /** Severity recorded on the history entry and mapped to an alert level. */
  severity: AlertSeverity
  /** Channel names; written to the history record as `channelsSent`. */
  channels: string[]
  /**
   * Per-channel delivery targets.
   * NOTE(review): not read anywhere in this file — presumably consumed by the
   * alert service; confirm against its implementation.
   */
  recipients?: {
    emails?: Array<{ email: string }>
    slackWebhook?: string
    discordWebhook?: string
  }
  /** Minimum number of minutes between two firings of this rule. */
  cooldownMinutes: number
  /** Rules are only fetched for evaluation when this is `true`. */
  enabled: boolean
}
|
||||||
|
|
||||||
|
/**
 * Maps an `AlertSeverity` to the level string passed to the alert service.
 *
 * The value type is the literal union rather than `string`, so lookups yield
 * a level that is already correctly typed.
 */
const SEVERITY_TO_LEVEL: Record<AlertSeverity, 'warning' | 'error' | 'critical'> = {
  warning: 'warning',
  error: 'error',
  critical: 'critical',
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// AlertEvaluator Class
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/**
 * Evaluates enabled alert rules against a metrics snapshot, enforces a
 * per-rule cooldown held in memory, and records/dispatches alerts that fire.
 */
export class AlertEvaluator {
  /** Last fire time (`Date.now()` milliseconds) per rule id, used to enforce cooldown. */
  private readonly cooldownMap = new Map<string, number>()

  /**
   * Decides whether a rule may fire now, and records the firing if so.
   *
   * @param ruleId - Identifier of the rule; cooldowns are tracked per id.
   * @param cooldownMinutes - Minimum number of minutes between two firings.
   * @returns `true` when the rule has never fired or at least
   *   `cooldownMinutes` have elapsed since it last fired (the current time is
   *   then stored as the new last-fired time); `false` while in cooldown.
   */
  shouldFire(ruleId: string, cooldownMinutes: number): boolean {
    const now = Date.now()
    const lastFired = this.cooldownMap.get(ruleId)

    // Compare against undefined, not truthiness: a stored timestamp of 0
    // (e.g. under a fake clock) is a valid last-fired time.
    if (lastFired !== undefined) {
      const elapsedMinutes = (now - lastFired) / 60_000
      if (elapsedMinutes < cooldownMinutes) return false
    }

    this.cooldownMap.set(ruleId, now)
    return true
  }

  /**
   * Evaluates all enabled rules against current metrics and dispatches an
   * alert for every rule whose condition matches and which is not in cooldown.
   *
   * Rules whose metric path does not resolve to a number are skipped. Any
   * error thrown while loading or evaluating rules is logged and swallowed,
   * so this method does not reject.
   *
   * @param payload - Payload instance used to load rules and store history.
   * @param metrics - Current metrics snapshot (without its timestamp).
   */
  async evaluateRules(
    payload: Payload,
    metrics: Omit<SystemMetrics, 'timestamp'>,
  ): Promise<void> {
    try {
      // NOTE(review): a single query with limit 100 — enabled rules beyond
      // that count would not be evaluated; confirm this cap is intended.
      const rules = await payload.find({
        collection: 'monitoring-alert-rules',
        where: { enabled: { equals: true } },
        limit: 100,
      })
      const metricTree = metrics as unknown as Record<string, unknown>

      for (const doc of rules.docs) {
        const rule = doc as unknown as AlertRule

        const value = getMetricValue(metricTree, rule.metric)
        if (value === undefined) continue
        if (!evaluateCondition(rule.condition, value, rule.threshold)) continue
        // shouldFire records the fire time, so it must only run for matching rules.
        if (!this.shouldFire(rule.id, rule.cooldownMinutes)) continue

        await this.dispatchAlert(payload, rule, value)
      }
    } catch (error) {
      console.error('[AlertEvaluator] Error evaluating rules:', error)
    }
  }

  /**
   * Creates an alert history record and then attempts to send notifications
   * via the alert service.
   *
   * Failure to create the history record is logged and aborts the dispatch.
   * Failure to load or call the alert service is logged as a warning only;
   * the history record is kept.
   *
   * @param payload - Payload instance used to create the history record.
   * @param rule - Rule that fired.
   * @param value - Metric value that triggered the rule.
   */
  private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<void> {
    const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`

    try {
      // NOTE(review): `channelsSent` is written from the rule's configured
      // channels before any delivery is attempted, so it reflects intent
      // rather than confirmed delivery.
      await payload.create({
        collection: 'monitoring-alert-history',
        data: {
          rule: rule.id,
          metric: rule.metric,
          value,
          threshold: rule.threshold,
          severity: rule.severity,
          message,
          channelsSent: rule.channels,
        },
      })

      // Try to send via existing alert service
      try {
        const { sendAlert } = await import('../alerting/alert-service.js')
        await sendAlert(payload, {
          level: SEVERITY_TO_LEVEL[rule.severity] as 'warning' | 'error' | 'critical',
          title: `[${rule.severity.toUpperCase()}] ${rule.name}`,
          message,
          details: {
            metric: rule.metric,
            value,
            threshold: rule.threshold,
            condition: rule.condition,
          },
        })
      } catch (error) {
        // Alert service not available, history record is sufficient.
        // The cause is logged so delivery failures can be diagnosed.
        console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`, error)
      }
    } catch (error) {
      console.error('[AlertEvaluator] Error dispatching alert:', error)
    }
  }
}
|
||||||
100
tests/unit/monitoring/alert-evaluator.unit.spec.ts
Normal file
100
tests/unit/monitoring/alert-evaluator.unit.spec.ts
Normal file
|
|
@ -0,0 +1,100 @@
|
||||||
|
import { describe, it, expect } from 'vitest'
|
||||||
|
import {
|
||||||
|
getMetricValue,
|
||||||
|
evaluateCondition,
|
||||||
|
AlertEvaluator,
|
||||||
|
} from '@/lib/monitoring/alert-evaluator'
|
||||||
|
|
||||||
|
// Unit tests for getMetricValue: dot-path resolution and its undefined fallbacks.
describe('getMetricValue', () => {
  it('resolves simple paths', () => {
    const metrics = { system: { cpuUsagePercent: 92 } }
    expect(getMetricValue(metrics, 'system.cpuUsagePercent')).toBe(92)
  })

  it('resolves deeply nested paths', () => {
    const metrics = { services: { redis: { memoryUsedMB: 512 } } }
    expect(getMetricValue(metrics, 'services.redis.memoryUsedMB')).toBe(512)
  })

  it('returns undefined for non-existent paths', () => {
    const metrics = { system: { cpuUsagePercent: 92 } }
    expect(getMetricValue(metrics, 'system.nonExistent')).toBeUndefined()
  })

  it('returns undefined for non-numeric values', () => {
    const metrics = { system: { name: 'test' } }
    expect(getMetricValue(metrics, 'system.name')).toBeUndefined()
  })

  it('returns undefined when traversing through null', () => {
    const metrics = { system: null }
    expect(getMetricValue(metrics as Record<string, unknown>, 'system.cpu')).toBeUndefined()
  })

  it('resolves top-level numeric values', () => {
    const metrics = { uptime: 3600 }
    expect(getMetricValue(metrics, 'uptime')).toBe(3600)
  })
})
|
||||||
|
|
||||||
|
// Unit tests for evaluateCondition: each operator is checked above, at and below the threshold.
describe('evaluateCondition', () => {
  it('gt: fires when value exceeds threshold', () => {
    expect(evaluateCondition('gt', 92, 80)).toBe(true)
    expect(evaluateCondition('gt', 80, 80)).toBe(false)
    expect(evaluateCondition('gt', 45, 80)).toBe(false)
  })

  it('lt: fires when value is below threshold', () => {
    expect(evaluateCondition('lt', 5, 10)).toBe(true)
    expect(evaluateCondition('lt', 10, 10)).toBe(false)
    expect(evaluateCondition('lt', 15, 10)).toBe(false)
  })

  it('gte: fires when value >= threshold', () => {
    expect(evaluateCondition('gte', 80, 80)).toBe(true)
    expect(evaluateCondition('gte', 81, 80)).toBe(true)
    expect(evaluateCondition('gte', 79, 80)).toBe(false)
  })

  it('lte: fires when value <= threshold', () => {
    expect(evaluateCondition('lte', 10, 10)).toBe(true)
    expect(evaluateCondition('lte', 9, 10)).toBe(true)
    expect(evaluateCondition('lte', 11, 10)).toBe(false)
  })

  it('eq: fires when value equals threshold', () => {
    expect(evaluateCondition('eq', 100, 100)).toBe(true)
    expect(evaluateCondition('eq', 99, 100)).toBe(false)
  })

  it('returns false for unknown condition', () => {
    // `as never` bypasses the AlertCondition type to exercise the default branch.
    expect(evaluateCondition('unknown' as never, 50, 50)).toBe(false)
  })
})
|
||||||
|
|
||||||
|
// Unit tests for AlertEvaluator.shouldFire: per-rule cooldown bookkeeping.
describe('AlertEvaluator.shouldFire', () => {
  it('allows first fire', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
  })

  it('blocks during cooldown', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    expect(evaluator.shouldFire('rule-1', 15)).toBe(false)
  })

  it('different rules have independent cooldowns', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    expect(evaluator.shouldFire('rule-2', 15)).toBe(true)
  })

  it('allows fire after cooldown expires', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
    // With a 0-minute cooldown an immediate re-fire is allowed: the elapsed
    // time is >= 0 (possibly exactly 0 within the same millisecond), which is
    // never less than the cooldown of 0.
    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
  })
})
|
||||||
Loading…
Reference in a new issue