mirror of
https://github.com/complexcaresolutions/cms.c2sgmbh.git
synced 2026-03-17 22:04:10 +00:00
feat(monitoring): add alert evaluator with cooldown and multi-channel dispatch
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
dc14c5dbc7
commit
97c8f32967
2 changed files with 285 additions and 0 deletions
185
src/lib/monitoring/alert-evaluator.ts
Normal file
185
src/lib/monitoring/alert-evaluator.ts
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
/**
 * Alert Evaluator
 *
 * Evaluates monitoring alert rules against current system metrics.
 * Supports dot-notation metric paths, configurable conditions,
 * cooldown periods, and multi-channel alert dispatch.
 */
|
||||
|
||||
import type { Payload } from 'payload'
|
||||
import type { AlertCondition, AlertSeverity, SystemMetrics } from './types.js'
|
||||
|
||||
// ============================================================================
|
||||
// Pure Functions
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Looks up a numeric metric by walking a dot-separated path through a nested
 * metrics object.
 *
 * @param metrics - Object tree to search.
 * @param path - Dot-separated property path, e.g. `'system.cpuUsagePercent'`.
 * @returns The value found at `path` when it is a number; `undefined` when a
 *   segment cannot be traversed (missing, `null`, or not an object) or when the
 *   final value is not a number.
 *
 * @example
 * getMetricValue({ system: { cpuUsagePercent: 92 } }, 'system.cpuUsagePercent') // => 92
 */
export function getMetricValue(metrics: Record<string, unknown>, path: string): number | undefined {
  const resolved = path.split('.').reduce<unknown>((node, segment) => {
    // Only non-null objects can be descended into; anything else ends the walk.
    const traversable = typeof node === 'object' && node !== null
    return traversable ? (node as Record<string, unknown>)[segment] : undefined
  }, metrics)

  if (typeof resolved !== 'number') {
    return undefined
  }
  return resolved
}
|
||||
|
||||
/**
 * Tests a metric value against a threshold using the given comparison operator.
 *
 * @param condition - Comparison operator: `'gt'`, `'gte'`, `'lt'`, `'lte'` or `'eq'`.
 * @param value - Current metric value (left-hand side of the comparison).
 * @param threshold - Configured limit (right-hand side of the comparison).
 * @returns `true` when the comparison holds; `false` otherwise, including for
 *   an operator that is not one of the five listed above.
 */
export function evaluateCondition(
  condition: AlertCondition,
  value: number,
  threshold: number,
): boolean {
  if (condition === 'gt') return value > threshold
  if (condition === 'gte') return value >= threshold
  if (condition === 'lt') return value < threshold
  if (condition === 'lte') return value <= threshold
  if (condition === 'eq') return value === threshold

  // Unrecognised operators never match.
  return false
}
|
||||
|
||||
// ============================================================================
|
||||
// Types
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Shape the evaluator expects of a document from the
 * `monitoring-alert-rules` collection.
 */
interface AlertRule {
  /** Rule identifier; used as the cooldown key and as the history record's `rule` value. */
  id: string
  /** Display name, included in the alert title and message. */
  name: string
  /** Dot-notation path of the metric to read, e.g. `system.cpuUsagePercent`. */
  metric: string
  /** Comparison operator applied as `value <condition> threshold`. */
  condition: AlertCondition
  /** Right-hand side of the comparison. */
  threshold: number
  /** Severity stored on the history record and mapped to the alert-service level. */
  severity: AlertSeverity
  /** Channel names, stored on the history record as `channelsSent`. */
  channels: string[]
  /** Optional per-rule notification targets; not read by this module. */
  recipients?: {
    emails?: { email: string }[]
    slackWebhook?: string
    discordWebhook?: string
  }
  /** Minimum number of minutes between two fires of this rule. */
  cooldownMinutes: number
  /** Whether the rule is active; rules are queried with `enabled` equal to `true`. */
  enabled: boolean
}
|
||||
|
||||
// Maps each AlertSeverity to the level passed to alert-service.
// The value type is the literal union rather than `string`, so a looked-up
// level can be handed to the alert service without a type assertion.
const SEVERITY_TO_LEVEL: Record<AlertSeverity, 'warning' | 'error' | 'critical'> = {
  warning: 'warning',
  error: 'error',
  critical: 'critical',
}
|
||||
|
||||
// ============================================================================
|
||||
// AlertEvaluator Class
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Evaluates alert rules against a metrics snapshot and dispatches alerts for
 * the rules that match, enforcing a per-rule cooldown.
 *
 * Cooldown state is held in memory on the instance, so it is not shared
 * between instances and is lost when the instance is discarded.
 */
export class AlertEvaluator {
  /** Epoch-millisecond timestamp of the last fire, keyed by rule id. */
  private readonly cooldownMap: Map<string, number> = new Map()

  /**
   * Decides whether a rule may fire now and, if so, starts its cooldown.
   *
   * @param ruleId - Identifier of the rule being checked.
   * @param cooldownMinutes - Minimum number of minutes between two fires.
   * @returns `true` when the rule has never fired or at least
   *   `cooldownMinutes` have elapsed since the last fire; the current time is
   *   then recorded as the new last-fired time. `false` while in cooldown.
   */
  shouldFire(ruleId: string, cooldownMinutes: number): boolean {
    const now = Date.now()
    const lastFired = this.cooldownMap.get(ruleId)

    // Compare against undefined rather than truthiness: 0 is a valid
    // timestamp (e.g. under a mocked clock) and must not read as "never fired".
    if (lastFired !== undefined && (now - lastFired) / 60_000 < cooldownMinutes) {
      return false
    }

    this.cooldownMap.set(ruleId, now)
    return true
  }

  /**
   * Evaluates the enabled rules against the given metrics and fires an alert
   * for each rule that matches and is not in cooldown.
   *
   * The query uses `limit: 100`, so a single run evaluates at most that many
   * rules. Errors are logged and never rethrown; a failure while processing
   * one rule does not prevent the remaining rules from being evaluated.
   *
   * @param payload - Payload instance used to read rules and write history.
   * @param metrics - Metrics snapshot the rule paths are resolved against.
   */
  async evaluateRules(
    payload: Payload,
    metrics: Omit<SystemMetrics, 'timestamp'>,
  ): Promise<void> {
    try {
      const rules = await payload.find({
        collection: 'monitoring-alert-rules',
        where: { enabled: { equals: true } },
        limit: 100,
      })

      for (const doc of rules.docs) {
        const rule = doc as unknown as AlertRule

        // Isolate each rule so one malformed document cannot abort the run.
        try {
          const value = getMetricValue(
            metrics as unknown as Record<string, unknown>,
            rule.metric,
          )
          if (value === undefined) continue
          if (!evaluateCondition(rule.condition, value, rule.threshold)) continue
          if (!this.shouldFire(rule.id, rule.cooldownMinutes)) continue

          const recorded = await this.dispatchAlert(payload, rule, value)
          if (!recorded) {
            // Nothing was recorded or sent, so release the cooldown that
            // shouldFire just started; otherwise a transient failure would
            // silence this rule for the whole cooldown period.
            this.cooldownMap.delete(rule.id)
          }
        } catch (error) {
          console.error(`[AlertEvaluator] Error evaluating rule ${rule.id}:`, error)
        }
      }
    } catch (error) {
      console.error('[AlertEvaluator] Error evaluating rules:', error)
    }
  }

  /**
   * Creates an alert history record and then attempts to send notifications
   * via the existing alert service.
   *
   * A notification failure is only logged as a warning, because the history
   * record is treated as sufficient.
   *
   * @param payload - Payload instance used to create the history record.
   * @param rule - Rule that matched.
   * @param value - Metric value that triggered the rule.
   * @returns `true` when the history record was created, `false` when creating
   *   it failed (in which case no notification is attempted).
   */
  private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<boolean> {
    const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`

    try {
      await payload.create({
        collection: 'monitoring-alert-history',
        data: {
          rule: rule.id,
          metric: rule.metric,
          value,
          threshold: rule.threshold,
          severity: rule.severity,
          message,
          channelsSent: rule.channels,
        },
      })
    } catch (error) {
      console.error('[AlertEvaluator] Error dispatching alert:', error)
      return false
    }

    // Try to send via existing alert service
    try {
      const { sendAlert } = await import('../alerting/alert-service.js')
      await sendAlert(payload, {
        level: SEVERITY_TO_LEVEL[rule.severity] as 'warning' | 'error' | 'critical',
        title: `[${rule.severity.toUpperCase()}] ${rule.name}`,
        message,
        details: {
          metric: rule.metric,
          value,
          threshold: rule.threshold,
          condition: rule.condition,
        },
      })
    } catch (error) {
      // Alert service not available, history record is sufficient.
      // The cause is logged so delivery problems can be diagnosed.
      console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`, error)
    }

    return true
  }
}
|
||||
100
tests/unit/monitoring/alert-evaluator.unit.spec.ts
Normal file
100
tests/unit/monitoring/alert-evaluator.unit.spec.ts
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
import { describe, it, expect, vi } from 'vitest'
import {
  getMetricValue,
  evaluateCondition,
  AlertEvaluator,
} from '@/lib/monitoring/alert-evaluator'
|
||||
|
||||
// Unit tests for dot-notation metric lookup.
describe('getMetricValue', () => {
  it('resolves simple paths', () => {
    const resolved = getMetricValue({ system: { cpuUsagePercent: 92 } }, 'system.cpuUsagePercent')
    expect(resolved).toBe(92)
  })

  it('resolves deeply nested paths', () => {
    const resolved = getMetricValue(
      { services: { redis: { memoryUsedMB: 512 } } },
      'services.redis.memoryUsedMB',
    )
    expect(resolved).toBe(512)
  })

  it('returns undefined for non-existent paths', () => {
    const resolved = getMetricValue({ system: { cpuUsagePercent: 92 } }, 'system.nonExistent')
    expect(resolved).toBeUndefined()
  })

  it('returns undefined for non-numeric values', () => {
    const resolved = getMetricValue({ system: { name: 'test' } }, 'system.name')
    expect(resolved).toBeUndefined()
  })

  it('returns undefined when traversing through null', () => {
    // A null intermediate node cannot be descended into.
    const input = { system: null } as Record<string, unknown>
    expect(getMetricValue(input, 'system.cpu')).toBeUndefined()
  })

  it('resolves top-level numeric values', () => {
    const resolved = getMetricValue({ uptime: 3600 }, 'uptime')
    expect(resolved).toBe(3600)
  })
})
|
||||
|
||||
// Unit tests for the comparison operators.
describe('evaluateCondition', () => {
  type Condition = Parameters<typeof evaluateCondition>[0]

  // Checks, in order, each [value, expected] pair for one operator and threshold.
  const expectOutcomes = (
    condition: Condition,
    threshold: number,
    cases: Array<[value: number, expected: boolean]>,
  ): void => {
    for (const [value, expected] of cases) {
      expect(evaluateCondition(condition, value, threshold)).toBe(expected)
    }
  }

  it('gt: fires when value exceeds threshold', () => {
    expectOutcomes('gt', 80, [
      [92, true],
      [80, false],
      [45, false],
    ])
  })

  it('lt: fires when value is below threshold', () => {
    expectOutcomes('lt', 10, [
      [5, true],
      [10, false],
      [15, false],
    ])
  })

  it('gte: fires when value >= threshold', () => {
    expectOutcomes('gte', 80, [
      [80, true],
      [81, true],
      [79, false],
    ])
  })

  it('lte: fires when value <= threshold', () => {
    expectOutcomes('lte', 10, [
      [10, true],
      [9, true],
      [11, false],
    ])
  })

  it('eq: fires when value equals threshold', () => {
    expectOutcomes('eq', 100, [
      [100, true],
      [99, false],
    ])
  })

  it('returns false for unknown condition', () => {
    expectOutcomes('unknown' as never, 50, [[50, false]])
  })
})
|
||||
|
||||
// Unit tests for the per-rule cooldown bookkeeping.
describe('AlertEvaluator.shouldFire', () => {
  it('allows first fire', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
  })

  it('blocks during cooldown', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    expect(evaluator.shouldFire('rule-1', 15)).toBe(false)
  })

  it('different rules have independent cooldowns', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    expect(evaluator.shouldFire('rule-2', 15)).toBe(true)
  })

  it('allows fire after cooldown expires', () => {
    // Use a mocked clock so the test actually crosses the cooldown boundary
    // instead of relying on a zero-minute cooldown.
    vi.useFakeTimers()
    try {
      vi.setSystemTime(new Date('2026-01-01T00:00:00Z'))
      const evaluator = new AlertEvaluator()
      expect(evaluator.shouldFire('rule-1', 15)).toBe(true)

      // 14 minutes in: still inside the 15-minute cooldown.
      vi.advanceTimersByTime(14 * 60_000)
      expect(evaluator.shouldFire('rule-1', 15)).toBe(false)

      // 15 minutes in: elapsed is no longer below the cooldown.
      vi.advanceTimersByTime(60_000)
      expect(evaluator.shouldFire('rule-1', 15)).toBe(true)
    } finally {
      vi.useRealTimers()
    }
  })

  it('allows immediate re-fire with a zero-minute cooldown', () => {
    const evaluator = new AlertEvaluator()
    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
    // Elapsed time may be exactly 0 here; the rule is blocked only while
    // `elapsed < cooldown`, and `0 < 0` is false, so it fires again.
    expect(evaluator.shouldFire('rule-1', 0)).toBe(true)
  })
})
|
||||
Loading…
Reference in a new issue