mirror of
https://github.com/complexcaresolutions/cms.c2sgmbh.git
synced 2026-03-17 22:04:10 +00:00
fix(ci): increase build heap size and format monitoring files
Build was OOM-ing in CI with default Node heap limit. Added NODE_OPTIONS with 4GB heap. Also ran Prettier on monitoring files. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
884d33c0ae
commit
037835d1de
7 changed files with 481 additions and 510 deletions
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
|
|
@ -212,6 +212,7 @@ jobs:
|
||||||
- name: Build application
|
- name: Build application
|
||||||
run: pnpm build
|
run: pnpm build
|
||||||
env:
|
env:
|
||||||
|
NODE_OPTIONS: '--max-old-space-size=4096'
|
||||||
# Minimal env vars for build
|
# Minimal env vars for build
|
||||||
PAYLOAD_SECRET: build-secret-placeholder
|
PAYLOAD_SECRET: build-secret-placeholder
|
||||||
DATABASE_URI: postgresql://placeholder:placeholder@localhost:5432/placeholder
|
DATABASE_URI: postgresql://placeholder:placeholder@localhost:5432/placeholder
|
||||||
|
|
|
||||||
|
|
@ -6,8 +6,8 @@
|
||||||
* cooldown periods, and multi-channel alert dispatch.
|
* cooldown periods, and multi-channel alert dispatch.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import type { Payload } from 'payload'
|
import type { Payload } from "payload";
|
||||||
import type { AlertCondition, AlertSeverity, SystemMetrics } from './types'
|
import type { AlertCondition, AlertSeverity, SystemMetrics } from "./types";
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// Pure Functions
|
// Pure Functions
|
||||||
|
|
@ -18,40 +18,36 @@ import type { AlertCondition, AlertSeverity, SystemMetrics } from './types'
|
||||||
* Example: getMetricValue(metrics, 'system.cpuUsagePercent') => 92
|
* Example: getMetricValue(metrics, 'system.cpuUsagePercent') => 92
|
||||||
*/
|
*/
|
||||||
export function getMetricValue(metrics: Record<string, unknown>, path: string): number | undefined {
  // Walk the dot-separated path one segment at a time, starting at the root object.
  let current: unknown = metrics;

  for (const part of path.split(".")) {
    // Only objects can be descended into; null, undefined and primitives end the lookup.
    if (current === null || typeof current !== "object") {
      return undefined;
    }
    current = (current as Record<string, unknown>)[part];
  }

  // NaN has typeof "number" but is not a usable reading: every comparison against it
  // is false, so report it the same way as a missing metric.
  if (typeof current !== "number" || Number.isNaN(current)) {
    return undefined;
  }
  return current;
}
|
||||||
|
|
||||||
/**
 * Applies a comparison operator to a metric value and a threshold.
 *
 * @param condition - Operator name: "gt", "gte", "lt", "lte" or "eq".
 * @param value - The observed metric value (left-hand side).
 * @param threshold - The configured limit (right-hand side).
 * @returns The result of the comparison; false for any unrecognised operator.
 */
export function evaluateCondition(condition: AlertCondition, value: number, threshold: number): boolean {
  if (condition === "gt") return value > threshold;
  if (condition === "lt") return value < threshold;
  if (condition === "eq") return value === threshold;
  if (condition === "gte") return value >= threshold;
  if (condition === "lte") return value <= threshold;

  // Unknown operator: never report a match.
  return false;
}
|
||||||
|
|
||||||
|
|
@ -60,28 +56,28 @@ export function evaluateCondition(
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
/**
 * Shape this module expects of a document read from the
 * "monitoring-alert-rules" collection.
 */
interface AlertRule {
  /** Rule identifier; stringified to key the cooldown map. */
  id: number;
  /** Display name, used in alert titles and messages. */
  name: string;
  /** Dot-separated path into the metrics object (see getMetricValue). */
  metric: string;
  /** Comparison operator applied between the metric value and `threshold`. */
  condition: AlertCondition;
  /** Right-hand side of the comparison. */
  threshold: number;
  /** Severity, mapped to an alert level when dispatching. */
  severity: AlertSeverity;
  /** Channels recorded on the history entry as `channelsSent`. */
  channels: ("email" | "slack" | "discord")[];
  /** Optional per-channel delivery targets. */
  recipients?: {
    emails?: { email: string }[];
    slackWebhook?: string;
    discordWebhook?: string;
  };
  /** Minimum number of minutes between two fires of this rule. */
  cooldownMinutes: number;
  /** Rules are fetched with `enabled: true`. */
  enabled: boolean;
}
|
||||||
|
|
||||||
// Maps AlertSeverity to the AlertLevel expected by alert-service.
// Values are typed as the literal level union (not plain `string`) so a lookup
// already has the type the alert call needs, without a widening-then-cast round trip.
const SEVERITY_TO_LEVEL: Record<AlertSeverity, "warning" | "error" | "critical"> = {
  warning: "warning",
  error: "error",
  critical: "critical",
};
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// AlertEvaluator Class
|
// AlertEvaluator Class
|
||||||
|
|
@ -89,58 +85,52 @@ const SEVERITY_TO_LEVEL: Record<AlertSeverity, string> = {
|
||||||
|
|
||||||
export class AlertEvaluator {
|
export class AlertEvaluator {
|
||||||
/** Tracks last fire time per rule to enforce cooldown */
|
/** Tracks last fire time per rule to enforce cooldown */
|
||||||
private cooldownMap: Map<string, number> = new Map()
|
private cooldownMap: Map<string, number> = new Map();
|
||||||
|
|
||||||
/**
 * Returns true if the rule should fire (not in cooldown).
 *
 * @param ruleId - Key under which the rule's last fire time is stored.
 * @param cooldownMinutes - Minimum number of minutes that must pass between fires.
 * @returns true when the rule has never fired or its cooldown has elapsed.
 */
shouldFire(ruleId: string, cooldownMinutes: number): boolean {
  const lastFired = this.cooldownMap.get(ruleId);

  // Test for absence explicitly: a truthiness check would treat a recorded
  // timestamp of 0 as "never fired".
  if (lastFired === undefined) return true;

  const elapsedMinutes = (Date.now() - lastFired) / 60_000;

  // Written as a negated "<" so a NaN cooldown still lets the rule fire,
  // exactly as the comparison did before.
  return !(elapsedMinutes < cooldownMinutes);
}
|
||||||
|
|
||||||
/** Stores the current time as the rule's last fire time, which starts its cooldown. */
recordFired(ruleId: string): void {
  const firedAt = Date.now();
  this.cooldownMap.set(ruleId, firedAt);
}
|
||||||
|
|
||||||
/**
 * Loads the enabled alert rules (up to 100) and checks each against `metrics`.
 * A rule whose condition holds and which is outside its cooldown is dispatched
 * and then stamped as fired. Any error is logged and swallowed.
 *
 * @param payload - Payload instance used to read rules and dispatch alerts.
 * @param metrics - Current metrics snapshot, without its timestamp.
 */
async evaluateRules(payload: Payload, metrics: Omit<SystemMetrics, "timestamp">): Promise<void> {
  try {
    const { docs } = await payload.find({
      collection: "monitoring-alert-rules",
      where: { enabled: { equals: true } },
      limit: 100,
    });

    for (const doc of docs) {
      const rule = doc as unknown as AlertRule;

      // Skip rules whose metric path does not resolve to a number.
      const value = getMetricValue(metrics as unknown as Record<string, unknown>, rule.metric);
      if (value === undefined) continue;

      if (!evaluateCondition(rule.condition, value, rule.threshold)) continue;

      const ruleKey = String(rule.id);
      if (!this.shouldFire(ruleKey, rule.cooldownMinutes)) continue;

      await this.dispatchAlert(payload, rule, value);
      this.recordFired(ruleKey);
    }
  } catch (error) {
    console.error("[AlertEvaluator] Error evaluating rules:", error);
  }
}
|
||||||
|
|
||||||
|
|
@ -149,11 +139,11 @@ export class AlertEvaluator {
|
||||||
* via the existing alert service.
|
* via the existing alert service.
|
||||||
*/
|
*/
|
||||||
private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<void> {
|
private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<void> {
|
||||||
const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`
|
const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await payload.create({
|
await payload.create({
|
||||||
collection: 'monitoring-alert-history',
|
collection: "monitoring-alert-history",
|
||||||
data: {
|
data: {
|
||||||
rule: rule.id,
|
rule: rule.id,
|
||||||
metric: rule.metric,
|
metric: rule.metric,
|
||||||
|
|
@ -163,13 +153,13 @@ export class AlertEvaluator {
|
||||||
message,
|
message,
|
||||||
channelsSent: rule.channels,
|
channelsSent: rule.channels,
|
||||||
},
|
},
|
||||||
})
|
});
|
||||||
|
|
||||||
// Try to send via existing alert service
|
// Try to send via existing alert service
|
||||||
try {
|
try {
|
||||||
const { sendAlert } = await import('../alerting/alert-service.js')
|
const { sendAlert } = await import("../alerting/alert-service.js");
|
||||||
await sendAlert(payload, {
|
await sendAlert(payload, {
|
||||||
level: SEVERITY_TO_LEVEL[rule.severity] as 'warning' | 'error' | 'critical',
|
level: SEVERITY_TO_LEVEL[rule.severity] as "warning" | "error" | "critical",
|
||||||
title: `[${rule.severity.toUpperCase()}] ${rule.name}`,
|
title: `[${rule.severity.toUpperCase()}] ${rule.name}`,
|
||||||
message,
|
message,
|
||||||
details: {
|
details: {
|
||||||
|
|
@ -178,13 +168,13 @@ export class AlertEvaluator {
|
||||||
threshold: rule.threshold,
|
threshold: rule.threshold,
|
||||||
condition: rule.condition,
|
condition: rule.condition,
|
||||||
},
|
},
|
||||||
})
|
});
|
||||||
} catch {
|
} catch {
|
||||||
// Alert service not available, history record is sufficient
|
// Alert service not available, history record is sufficient
|
||||||
console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`)
|
console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('[AlertEvaluator] Error dispatching alert:', error)
|
console.error("[AlertEvaluator] Error dispatching alert:", error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@
|
||||||
* Falls back to console output when Payload is not yet initialized.
|
* Falls back to console output when Payload is not yet initialized.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import type { LogLevel, LogSource } from './types'
|
import type { LogLevel, LogSource } from "./types";
|
||||||
|
|
||||||
const LOG_LEVELS: Record<LogLevel, number> = {
|
const LOG_LEVELS: Record<LogLevel, number> = {
|
||||||
debug: 0,
|
debug: 0,
|
||||||
|
|
@ -13,63 +13,58 @@ const LOG_LEVELS: Record<LogLevel, number> = {
|
||||||
warn: 2,
|
warn: 2,
|
||||||
error: 3,
|
error: 3,
|
||||||
fatal: 4,
|
fatal: 4,
|
||||||
}
|
};
|
||||||
|
|
||||||
/**
 * Resolves the minimum level to log from the MONITORING_LOG_LEVEL environment variable.
 *
 * Falls back to "info" when the variable is unset, empty, or not a key of LOG_LEVELS.
 * Without that check an unrecognised value would make shouldLog compare against
 * `undefined`, which is false for every level and silently drops all logs.
 *
 * @returns A level that is guaranteed to be a key of LOG_LEVELS.
 */
function getMinLevel(): LogLevel {
  const configured = process.env.MONITORING_LOG_LEVEL;

  // Own-property check so names inherited from Object.prototype are not accepted.
  if (configured && Object.prototype.hasOwnProperty.call(LOG_LEVELS, configured)) {
    return configured as LogLevel;
  }
  return "info";
}
|
||||||
|
|
||||||
/** True when `level` ranks at or above the configured minimum level. */
function shouldLog(level: LogLevel): boolean {
  const requestedRank = LOG_LEVELS[level];
  const minimumRank = LOG_LEVELS[getMinLevel()];
  return requestedRank >= minimumRank;
}
|
||||||
|
|
||||||
/**
 * Optional structured data attached to a log entry. The four named fields are
 * split out when the entry is written; any additional keys are collected separately.
 */
export interface LogContext {
  requestId?: string;
  userId?: number;
  tenant?: number;
  duration?: number;
  /** Any further keys are accepted. */
  [key: string]: unknown;
}
|
||||||
|
|
||||||
/** Logger with one method per log level; each takes a message and optional context. */
export interface MonitoringLoggerInstance {
  debug(message: string, context?: LogContext): void;
  info(message: string, context?: LogContext): void;
  warn(message: string, context?: LogContext): void;
  error(message: string, context?: LogContext): void;
  fatal(message: string, context?: LogContext): void;
}
|
||||||
|
|
||||||
/** Cached Payload instance — resolved once, reused for all subsequent writes. */
|
/** Cached Payload instance — resolved once, reused for all subsequent writes. */
|
||||||
let cachedPayload: any = null
|
let cachedPayload: any = null;
|
||||||
|
|
||||||
/**
 * Returns the module-level cached Payload instance, creating it on first use.
 * Both `payload` and the project config are loaded lazily via dynamic import.
 */
async function getPayloadInstance(): Promise<any> {
  if (!cachedPayload) {
    const payloadModule = await import("payload");
    const configModule = await import(/* @vite-ignore */ "@payload-config");
    cachedPayload = await payloadModule.getPayload({ config: configModule.default });
  }
  return cachedPayload;
}
|
||||||
|
|
||||||
/**
 * Clears the cached Payload instance so the next lookup resolves it again.
 * Intended for tests.
 */
export function _resetPayloadCache(): void {
  cachedPayload = null;
}
|
||||||
|
|
||||||
async function writeLog(
|
async function writeLog(source: LogSource, level: LogLevel, message: string, context?: LogContext): Promise<void> {
|
||||||
source: LogSource,
|
if (!shouldLog(level)) return;
|
||||||
level: LogLevel,
|
|
||||||
message: string,
|
|
||||||
context?: LogContext,
|
|
||||||
): Promise<void> {
|
|
||||||
if (!shouldLog(level)) return
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const payload = await getPayloadInstance()
|
const payload = await getPayloadInstance();
|
||||||
|
|
||||||
const { requestId, userId, tenant, duration, ...rest } = context || {}
|
const { requestId, userId, tenant, duration, ...rest } = context || {};
|
||||||
|
|
||||||
await payload.create({
|
await payload.create({
|
||||||
collection: 'monitoring-logs',
|
collection: "monitoring-logs",
|
||||||
data: {
|
data: {
|
||||||
level,
|
level,
|
||||||
source,
|
source,
|
||||||
|
|
@ -80,12 +75,12 @@ async function writeLog(
|
||||||
tenant,
|
tenant,
|
||||||
duration,
|
duration,
|
||||||
},
|
},
|
||||||
})
|
});
|
||||||
} catch {
|
} catch {
|
||||||
// Fallback to console if Payload is not yet initialized
|
// Fallback to console if Payload is not yet initialized
|
||||||
cachedPayload = null
|
cachedPayload = null;
|
||||||
const prefix = `[${source}][${level.toUpperCase()}]`
|
const prefix = `[${source}][${level.toUpperCase()}]`;
|
||||||
console.log(prefix, message, context || '')
|
console.log(prefix, message, context || "");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -94,16 +89,16 @@ export function createMonitoringLogger(source: LogSource): MonitoringLoggerInsta
|
||||||
return function logMessage(message: string, context?: LogContext): void {
|
return function logMessage(message: string, context?: LogContext): void {
|
||||||
// Fire-and-forget -- don't block the caller
|
// Fire-and-forget -- don't block the caller
|
||||||
writeLog(source, level, message, context).catch(function onError(err) {
|
writeLog(source, level, message, context).catch(function onError(err) {
|
||||||
console.error(`[MonitoringLogger] Failed to write ${level} log:`, err)
|
console.error(`[MonitoringLogger] Failed to write ${level} log:`, err);
|
||||||
})
|
});
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
debug: log('debug'),
|
debug: log("debug"),
|
||||||
info: log('info'),
|
info: log("info"),
|
||||||
warn: log('warn'),
|
warn: log("warn"),
|
||||||
error: log('error'),
|
error: log("error"),
|
||||||
fatal: log('fatal'),
|
fatal: log("fatal"),
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,8 +5,8 @@
|
||||||
* dependency checks. Used by the monitoring dashboard and snapshot collector.
|
* dependency checks. Used by the monitoring dashboard and snapshot collector.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import os from 'node:os'
|
import os from "node:os";
|
||||||
import { execSync } from 'node:child_process'
|
import { execSync } from "node:child_process";
|
||||||
import type {
|
import type {
|
||||||
SystemHealth,
|
SystemHealth,
|
||||||
ProcessStatus,
|
ProcessStatus,
|
||||||
|
|
@ -21,9 +21,9 @@ import type {
|
||||||
SecurityMetricsStatus,
|
SecurityMetricsStatus,
|
||||||
PerformanceMetrics,
|
PerformanceMetrics,
|
||||||
SystemMetrics,
|
SystemMetrics,
|
||||||
} from './types'
|
} from "./types";
|
||||||
import { checkSecretsHealth } from '../security/secrets-health'
|
import { checkSecretsHealth } from "../security/secrets-health";
|
||||||
import { getSecurityMetricsSnapshot } from '../security/security-observability'
|
import { getSecurityMetricsSnapshot } from "../security/security-observability";
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// System Health
|
// System Health
|
||||||
|
|
@ -34,15 +34,15 @@ import { getSecurityMetricsSnapshot } from '../security/security-observability'
|
||||||
* CPU usage is calculated by sampling /proc/stat twice with 100ms delay.
|
* CPU usage is calculated by sampling /proc/stat twice with 100ms delay.
|
||||||
*/
|
*/
|
||||||
export async function checkSystemHealth(): Promise<SystemHealth> {
|
export async function checkSystemHealth(): Promise<SystemHealth> {
|
||||||
const cpuUsagePercent = await getCpuUsage()
|
const cpuUsagePercent = await getCpuUsage();
|
||||||
|
|
||||||
const memoryTotalMB = Math.round(os.totalmem() / 1024 / 1024)
|
const memoryTotalMB = Math.round(os.totalmem() / 1024 / 1024);
|
||||||
const memoryUsedMB = Math.round((os.totalmem() - os.freemem()) / 1024 / 1024)
|
const memoryUsedMB = Math.round((os.totalmem() - os.freemem()) / 1024 / 1024);
|
||||||
const memoryUsagePercent = roundToOneDecimal((memoryUsedMB / memoryTotalMB) * 100)
|
const memoryUsagePercent = roundToOneDecimal((memoryUsedMB / memoryTotalMB) * 100);
|
||||||
|
|
||||||
const { diskUsedGB, diskTotalGB, diskUsagePercent } = getDiskUsage()
|
const { diskUsedGB, diskTotalGB, diskUsagePercent } = getDiskUsage();
|
||||||
|
|
||||||
const [loadAvg1, loadAvg5] = os.loadavg()
|
const [loadAvg1, loadAvg5] = os.loadavg();
|
||||||
|
|
||||||
return {
|
return {
|
||||||
cpuUsagePercent: roundToOneDecimal(cpuUsagePercent),
|
cpuUsagePercent: roundToOneDecimal(cpuUsagePercent),
|
||||||
|
|
@ -55,7 +55,7 @@ export async function checkSystemHealth(): Promise<SystemHealth> {
|
||||||
loadAvg1: roundToTwoDecimals(loadAvg1),
|
loadAvg1: roundToTwoDecimals(loadAvg1),
|
||||||
loadAvg5: roundToTwoDecimals(loadAvg5),
|
loadAvg5: roundToTwoDecimals(loadAvg5),
|
||||||
uptime: Math.round(os.uptime()),
|
uptime: Math.round(os.uptime()),
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
@ -64,214 +64,212 @@ export async function checkSystemHealth(): Promise<SystemHealth> {
|
||||||
|
|
||||||
/**
 * Reads memory, client and throughput counters from the Redis INFO output.
 *
 * @returns An "online" status with the parsed counters, or an all-zero "offline"
 *          status when no client is available or any step throws.
 */
export async function checkRedis(): Promise<RedisStatus> {
  const offlineStatus: RedisStatus = {
    status: "offline",
    memoryUsedMB: 0,
    connectedClients: 0,
    opsPerSec: 0,
  };

  try {
    const redisModule = await import("../redis.js");
    const client = redisModule.getRedisClient();
    if (!client) return offlineStatus;

    const info = await client.info();

    // Extracts the integer after "<key>:" from the INFO text; 0 when the key is absent.
    const readCounter = (key: string): number => {
      const found = info.match(new RegExp(`${key}:(\\d+)`));
      if (!found) return 0;
      return parseInt(found[1], 10);
    };

    const usedMemoryBytes = readCounter("used_memory");
    const connectedClients = readCounter("connected_clients");
    const opsPerSec = readCounter("instantaneous_ops_per_sec");

    return {
      status: "online",
      memoryUsedMB: Math.round(usedMemoryBytes / 1024 / 1024),
      connectedClients,
      opsPerSec,
    };
  } catch {
    return offlineStatus;
  }
}
|
||||||
|
|
||||||
/**
 * Measures database latency with a zero-row Payload query, then tries to read
 * connection counts through psql.
 *
 * @returns "online" when the query took under 1000 ms, otherwise "warning";
 *          "offline" (latency -1) when Payload cannot be loaded or the query throws.
 */
export async function checkPostgresql(): Promise<PostgresqlStatus> {
  const offlineStatus: PostgresqlStatus = {
    status: "offline",
    connections: 0,
    maxConnections: 50,
    latencyMs: -1,
  };

  try {
    const { getPayload } = await import("payload");
    const { default: config } = await import("@payload-config");
    const payload = await getPayload({ config });

    // limit: 0 keeps the probe cheap; only the round-trip time matters.
    const startedAt = Date.now();
    await payload.find({ collection: "users", limit: 0 });
    const latencyMs = Date.now() - startedAt;

    // Defaults stay in place when psql cannot be run or its output does not parse.
    let connections = 0;
    let maxConnections = 50;
    try {
      // NOTE(review): host, user and database are hard-coded here — confirm they match the deployment.
      const activeRaw = runPsql(
        "-h 10.10.181.101 -U payload -d payload_db -t -c \"SELECT count(*) FROM pg_stat_activity WHERE datname = 'payload_db'\"",
      );
      connections = parseInt(activeRaw.trim(), 10) || 0;

      const maxRaw = runPsql('-h 10.10.181.101 -U payload -d payload_db -t -c "SHOW max_connections"');
      maxConnections = parseInt(maxRaw.trim(), 10) || 50;
    } catch {
      // psql unavailable -- latency check already proves connectivity
    }

    const status = latencyMs < 1000 ? "online" : "warning";
    return { status, connections, maxConnections, latencyMs };
  } catch {
    return offlineStatus;
  }
}
|
||||||
|
|
||||||
/**
 * Runs SHOW POOLS against the local PgBouncer admin database through psql and
 * aggregates the rows that mention "payload".
 *
 * @returns Summed active/waiting counts with status "online", or an all-zero
 *          "offline" status when psql throws.
 */
export async function checkPgBouncer(): Promise<PgBouncerStatus> {
  const offlineStatus: PgBouncerStatus = {
    status: "offline",
    activeConnections: 0,
    waitingClients: 0,
    poolSize: 0,
  };

  try {
    const output = runPsql('-h 127.0.0.1 -p 6432 -U payload -d pgbouncer -t -c "SHOW POOLS"');

    // SHOW POOLS columns: database | user | cl_active | cl_waiting | sv_active | sv_idle | pool_size | ...
    const poolRows = output
      .trim()
      .split("\n")
      .filter((row) => row.includes("payload"));

    let activeConnections = 0;
    let waitingClients = 0;
    let poolSize = 20;

    poolRows.forEach((row) => {
      const cells = row.split("|").map((cell) => cell.trim());
      activeConnections += parseInt(cells[2], 10) || 0;
      waitingClients += parseInt(cells[3], 10) || 0;
      // Not summed: the value from the last matching row wins (20 when unparsable).
      poolSize = parseInt(cells[6], 10) || 20;
    });

    return { status: "online", activeConnections, waitingClients, poolSize };
  } catch {
    return offlineStatus;
  }
}
|
||||||
|
|
||||||
/** Job counts for one queue, by job state. */
export interface QueueCounts {
  waiting: number;
  active: number;
  completed: number;
  failed: number;
}
|
||||||
|
|
||||||
/**
 * Collects job counts for each known queue.
 *
 * A queue that cannot be opened or queried is reported with all-zero counts.
 *
 * @returns Counts keyed by queue name, or an empty object when bullmq or the
 *          queue service cannot be loaded.
 */
export async function checkQueues(): Promise<Record<string, QueueCounts>> {
  try {
    const { Queue } = await import("bullmq");
    const { getQueueRedisConnection } = await import("../queue/queue-service.js");

    const connection = getQueueRedisConnection();
    // Queue names matching QUEUE_NAMES in queue-service.ts
    const queueNames = ["email", "pdf", "cleanup", "youtube-upload"];
    const results: Record<string, QueueCounts> = {};

    for (const name of queueNames) {
      try {
        const queue = new Queue(name, { connection });
        try {
          const counts = await queue.getJobCounts();
          results[name] = {
            waiting: counts.waiting || 0,
            active: counts.active || 0,
            completed: counts.completed || 0,
            failed: counts.failed || 0,
          };
        } finally {
          // Close in `finally` so the handle is released even when getJobCounts() throws.
          await queue.close();
        }
      } catch {
        results[name] = { waiting: 0, active: 0, completed: 0, failed: 0 };
      }
    }

    return results;
  } catch {
    return {};
  }
}
|
||||||
|
|
||||||
/**
 * Verifies the SMTP server configured through the SMTP_* environment variables.
 *
 * @returns "online" with the time `verify()` took, or "offline" with
 *          responseTimeMs -1 when loading nodemailer or verification throws.
 *          `lastCheck` is the ISO timestamp taken before the check starts.
 */
export async function checkSmtp(): Promise<SmtpStatus> {
  const now = new Date().toISOString();

  try {
    const nodemailer = await import("nodemailer");

    const { SMTP_HOST, SMTP_PORT, SMTP_SECURE, SMTP_USER, SMTP_PASS } = process.env;
    const transporter = nodemailer.createTransport({
      host: SMTP_HOST,
      port: parseInt(SMTP_PORT || "587", 10),
      secure: SMTP_SECURE === "true",
      auth: {
        user: SMTP_USER,
        pass: SMTP_PASS,
      },
    });

    const startedAt = Date.now();
    await transporter.verify();
    const responseTimeMs = Date.now() - startedAt;

    return { status: "online", lastCheck: now, responseTimeMs };
  } catch {
    return { status: "offline", lastCheck: now, responseTimeMs: -1 };
  }
}
|
||||||
|
|
||||||
/**
 * Summarises OAuth token expiry for connected social accounts.
 *
 * Reads up to 100 `social-accounts` documents whose status is `connected` and
 * tallies them into two buckets: accounts whose `platform` equals `"youtube"`,
 * and everything else (reported as Meta). A token counts as expired when
 * `tokenExpiresAt` is in the past and as expiring soon when it falls within
 * the next seven calendar days; accounts without `tokenExpiresAt` only
 * contribute to the total.
 *
 * @returns One status per bucket. If Payload cannot be loaded or the query
 *   fails, both buckets carry the same `error` status with zeroed counters.
 */
export async function checkOAuthTokens(): Promise<{
  metaOAuth: OAuthTokenStatus;
  youtubeOAuth: OAuthTokenStatus;
}> {
  const errorStatus: OAuthTokenStatus = {
    status: "error",
    tokensTotal: 0,
    tokensExpiringSoon: 0,
    tokensExpired: 0,
  };

  try {
    const { getPayload } = await import("payload");
    const configModule = await import("@payload-config");
    const payload = await getPayload({ config: configModule.default });

    const { docs } = await payload.find({
      collection: "social-accounts",
      limit: 100,
      where: { status: { equals: "connected" } },
    });

    const soonCutoff = new Date();
    soonCutoff.setDate(soonCutoff.getDate() + 7);
    const currentTime = new Date();

    const meta = { tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 };
    const youtube = { tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 };

    docs.forEach((account) => {
      const record = account as unknown as Record<string, unknown>;
      // Everything that is not YouTube is accounted for under Meta.
      const tally = record.platform === "youtube" ? youtube : meta;
      tally.tokensTotal += 1;

      if (!record.tokenExpiresAt) {
        return;
      }

      const expiry = new Date(record.tokenExpiresAt as string);
      if (expiry < currentTime) {
        tally.tokensExpired += 1;
      } else if (expiry < soonCutoff) {
        tally.tokensExpiringSoon += 1;
      }
    });

    return {
      metaOAuth: { status: getOAuthStatus(meta), ...meta },
      youtubeOAuth: { status: getOAuthStatus(youtube), ...youtube },
    };
  } catch {
    return { metaOAuth: errorStatus, youtubeOAuth: errorStatus };
  }
}
|
||||||
|
|
||||||
/**
 * Reports the outcome of the most recent run of each monitored cron job.
 *
 * For every job the newest `monitoring-logs` entry with source `cron` whose
 * message contains the job's marker (`community-sync`, `token-refresh`,
 * `youtube`) is looked up. An entry with level `error` maps to `failed`, any
 * other level to `ok`.
 *
 * @returns Status per job. A job without log entries, a failed lookup, or a
 *   failure to initialise Payload all yield `unknown` with an empty `lastRun`.
 */
export async function checkCronJobs(): Promise<CronStatuses> {
  const unknownStatus: CronJobStatus = { lastRun: "", status: "unknown" };

  try {
    const { getPayload } = await import("payload");
    const { default: config } = await import("@payload-config");
    const payload = await getPayload({ config });

    // Each lookup has its own try/catch so one failing query cannot affect the others.
    const latestRunOf = async (source: string): Promise<CronJobStatus> => {
      try {
        const { docs } = await payload.find({
          collection: "monitoring-logs",
          limit: 1,
          sort: "-createdAt",
          where: {
            and: [{ source: { equals: "cron" } }, { message: { contains: source } }],
          },
        });

        if (docs.length === 0) {
          return unknownStatus;
        }

        const entry = docs[0] as unknown as Record<string, unknown>;
        const failed = entry.level === "error";
        return {
          lastRun: entry.createdAt as string,
          status: failed ? "failed" : "ok",
        };
      } catch {
        return unknownStatus;
      }
    };

    const [communitySync, tokenRefresh, youtubeSync] = await Promise.all([
      latestRunOf("community-sync"),
      latestRunOf("token-refresh"),
      latestRunOf("youtube"),
    ]);

    return { communitySync, tokenRefresh, youtubeSync };
  } catch {
    return {
      communitySync: unknownStatus,
      tokenRefresh: unknownStatus,
      youtubeSync: unknownStatus,
    };
  }
}
|
||||||
|
|
||||||
|
|
@ -342,18 +337,19 @@ export async function checkCronJobs(): Promise<CronStatuses> {
|
||||||
* Collects all monitoring metrics in parallel. Individual check failures
|
* Collects all monitoring metrics in parallel. Individual check failures
|
||||||
* are isolated and return safe defaults instead of failing the whole collection.
|
* are isolated and return safe defaults instead of failing the whole collection.
|
||||||
*/
|
*/
|
||||||
export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>> {
|
export async function collectMetrics(): Promise<Omit<SystemMetrics, "timestamp">> {
|
||||||
const [system, redis, postgresql, pgbouncer, smtp, oauth, cronJobs, secrets, securityEvents] = await Promise.allSettled([
|
const [system, redis, postgresql, pgbouncer, smtp, oauth, cronJobs, secrets, securityEvents] =
|
||||||
checkSystemHealth(),
|
await Promise.allSettled([
|
||||||
checkRedis(),
|
checkSystemHealth(),
|
||||||
checkPostgresql(),
|
checkRedis(),
|
||||||
checkPgBouncer(),
|
checkPostgresql(),
|
||||||
checkSmtp(),
|
checkPgBouncer(),
|
||||||
checkOAuthTokens(),
|
checkSmtp(),
|
||||||
checkCronJobs(),
|
checkOAuthTokens(),
|
||||||
Promise.resolve(checkSecretsHealth()),
|
checkCronJobs(),
|
||||||
Promise.resolve(getSecurityMetricsSnapshot()),
|
Promise.resolve(checkSecretsHealth()),
|
||||||
])
|
Promise.resolve(getSecurityMetricsSnapshot()),
|
||||||
|
]);
|
||||||
|
|
||||||
// Load performance tracker lazily to avoid circular dependencies
|
// Load performance tracker lazily to avoid circular dependencies
|
||||||
let performance: PerformanceMetrics = {
|
let performance: PerformanceMetrics = {
|
||||||
|
|
@ -362,51 +358,51 @@ export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>
|
||||||
p99ResponseTimeMs: 0,
|
p99ResponseTimeMs: 0,
|
||||||
errorRate: 0,
|
errorRate: 0,
|
||||||
requestsPerMinute: 0,
|
requestsPerMinute: 0,
|
||||||
}
|
};
|
||||||
try {
|
try {
|
||||||
// Dynamic path constructed at runtime to avoid Vite static analysis
|
// Dynamic path constructed at runtime to avoid Vite static analysis
|
||||||
// when performance-tracker module has not been created yet
|
// when performance-tracker module has not been created yet
|
||||||
const trackerPath = './performance-tracker'
|
const trackerPath = "./performance-tracker";
|
||||||
const mod = await import(/* @vite-ignore */ trackerPath)
|
const mod = await import(/* @vite-ignore */ trackerPath);
|
||||||
performance = mod.performanceTracker.getMetrics('1h')
|
performance = mod.performanceTracker.getMetrics("1h");
|
||||||
} catch {
|
} catch {
|
||||||
// Performance tracker not yet initialized
|
// Performance tracker not yet initialized
|
||||||
}
|
}
|
||||||
|
|
||||||
const defaultProcess: ProcessStatus = {
|
const defaultProcess: ProcessStatus = {
|
||||||
status: 'offline',
|
status: "offline",
|
||||||
pid: 0,
|
pid: 0,
|
||||||
memoryMB: 0,
|
memoryMB: 0,
|
||||||
uptimeSeconds: 0,
|
uptimeSeconds: 0,
|
||||||
restarts: 0,
|
restarts: 0,
|
||||||
}
|
};
|
||||||
|
|
||||||
const { payloadProcess, queueWorkerProcess } = getPm2Processes(defaultProcess)
|
const { payloadProcess, queueWorkerProcess } = getPm2Processes(defaultProcess);
|
||||||
|
|
||||||
const oauthDefaults = {
|
const oauthDefaults = {
|
||||||
metaOAuth: { status: 'error' as const, tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 },
|
metaOAuth: { status: "error" as const, tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 },
|
||||||
youtubeOAuth: { status: 'error' as const, tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 },
|
youtubeOAuth: { status: "error" as const, tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 },
|
||||||
}
|
};
|
||||||
|
|
||||||
const cronDefaults: CronStatuses = {
|
const cronDefaults: CronStatuses = {
|
||||||
communitySync: { lastRun: '', status: 'unknown' },
|
communitySync: { lastRun: "", status: "unknown" },
|
||||||
tokenRefresh: { lastRun: '', status: 'unknown' },
|
tokenRefresh: { lastRun: "", status: "unknown" },
|
||||||
youtubeSync: { lastRun: '', status: 'unknown' },
|
youtubeSync: { lastRun: "", status: "unknown" },
|
||||||
}
|
};
|
||||||
|
|
||||||
const secretsDefaults: SecretsHealthStatus = {
|
const secretsDefaults: SecretsHealthStatus = {
|
||||||
status: 'critical',
|
status: "critical",
|
||||||
checkedAt: new Date().toISOString(),
|
checkedAt: new Date().toISOString(),
|
||||||
missing: [],
|
missing: [],
|
||||||
expiringSoon: [],
|
expiringSoon: [],
|
||||||
expired: [],
|
expired: [],
|
||||||
rotationOverdue: [],
|
rotationOverdue: [],
|
||||||
}
|
};
|
||||||
|
|
||||||
const securityEventsDefaults: SecurityMetricsStatus = {
|
const securityEventsDefaults: SecurityMetricsStatus = {
|
||||||
windowMs: 300000,
|
windowMs: 300000,
|
||||||
counters: [],
|
counters: [],
|
||||||
}
|
};
|
||||||
|
|
||||||
const systemDefaults: SystemHealth = {
|
const systemDefaults: SystemHealth = {
|
||||||
cpuUsagePercent: 0,
|
cpuUsagePercent: 0,
|
||||||
|
|
@ -419,21 +415,21 @@ export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>
|
||||||
loadAvg1: 0,
|
loadAvg1: 0,
|
||||||
loadAvg5: 0,
|
loadAvg5: 0,
|
||||||
uptime: 0,
|
uptime: 0,
|
||||||
}
|
};
|
||||||
|
|
||||||
const oauthResult = settled(oauth, oauthDefaults)
|
const oauthResult = settled(oauth, oauthDefaults);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
system: settled(system, systemDefaults),
|
system: settled(system, systemDefaults),
|
||||||
services: {
|
services: {
|
||||||
payload: payloadProcess,
|
payload: payloadProcess,
|
||||||
queueWorker: queueWorkerProcess,
|
queueWorker: queueWorkerProcess,
|
||||||
postgresql: settled(postgresql, { status: 'offline', connections: 0, maxConnections: 50, latencyMs: -1 }),
|
postgresql: settled(postgresql, { status: "offline", connections: 0, maxConnections: 50, latencyMs: -1 }),
|
||||||
pgbouncer: settled(pgbouncer, { status: 'offline', activeConnections: 0, waitingClients: 0, poolSize: 0 }),
|
pgbouncer: settled(pgbouncer, { status: "offline", activeConnections: 0, waitingClients: 0, poolSize: 0 }),
|
||||||
redis: settled(redis, { status: 'offline', memoryUsedMB: 0, connectedClients: 0, opsPerSec: 0 }),
|
redis: settled(redis, { status: "offline", memoryUsedMB: 0, connectedClients: 0, opsPerSec: 0 }),
|
||||||
},
|
},
|
||||||
external: {
|
external: {
|
||||||
smtp: settled(smtp, { status: 'offline', lastCheck: new Date().toISOString(), responseTimeMs: -1 }),
|
smtp: settled(smtp, { status: "offline", lastCheck: new Date().toISOString(), responseTimeMs: -1 }),
|
||||||
metaOAuth: oauthResult.metaOAuth,
|
metaOAuth: oauthResult.metaOAuth,
|
||||||
youtubeOAuth: oauthResult.youtubeOAuth,
|
youtubeOAuth: oauthResult.youtubeOAuth,
|
||||||
cronJobs: settled(cronJobs, cronDefaults),
|
cronJobs: settled(cronJobs, cronDefaults),
|
||||||
|
|
@ -441,7 +437,7 @@ export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>
|
||||||
securityEvents: settled(securityEvents, securityEventsDefaults),
|
securityEvents: settled(securityEvents, securityEventsDefaults),
|
||||||
},
|
},
|
||||||
performance,
|
performance,
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
@ -454,18 +450,18 @@ export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>
|
||||||
*/
|
*/
|
||||||
function runPsql(args: string): string {
  // NOTE(review): `args` is spliced into a shell command line verbatim, so it
  // must only ever come from trusted, hard-coded call sites.
  const command = `psql ${args}`;

  // The password is handed to psql through the child's environment only.
  const childEnv = { ...process.env, PGPASSWORD: process.env.DB_PASSWORD || "" };

  return execSync(command, {
    encoding: "utf-8",
    timeout: 5000,
    env: childEnv,
  });
}
|
||||||
|
|
||||||
/** Rounds `value` to one decimal place (`Math.round` applied to the value scaled by 10). */
function roundToOneDecimal(value: number): number {
  const tenths = Math.round(value * 10);
  return tenths / 10;
}
|
||||||
|
|
||||||
/** Rounds `value` to two decimal places (`Math.round` applied to the value scaled by 100). */
function roundToTwoDecimals(value: number): number {
  const hundredths = Math.round(value * 100);
  return hundredths / 100;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -473,99 +469,95 @@ function roundToTwoDecimals(value: number): number {
|
||||||
* the fallback when the promise was rejected.
|
* the fallback when the promise was rejected.
|
||||||
*/
|
*/
|
||||||
function settled<T>(result: PromiseSettledResult<T>, fallback: T): T {
  // Anything other than a fulfilled result resolves to the caller's fallback.
  if (result.status !== "fulfilled") {
    return fallback;
  }
  return result.value;
}
|
||||||
|
|
||||||
/**
 * Estimates current CPU utilisation as a percentage.
 *
 * Primary path: reads the aggregate `cpu` line of `/proc/stat` twice, 100 ms
 * apart, and returns the share of non-idle time (idle + iowait count as idle)
 * between the two samples. If `/proc/stat` cannot be read or does not have the
 * expected shape, the 1-minute load average divided by the CPU count is used
 * as an approximation instead.
 *
 * @returns A finite value clamped to the range 0–100. Never rejects.
 */
async function getCpuUsage(): Promise<number> {
  // Guarantees callers a finite percentage; load-based estimates can exceed 100.
  const toPercent = (value: number): number => (Number.isFinite(value) ? Math.min(100, Math.max(0, value)) : 0);

  try {
    const fs = await import("node:fs/promises");
    const stat1 = await fs.readFile("/proc/stat", "utf-8");
    await new Promise((resolve) => setTimeout(resolve, 100));
    const stat2 = await fs.readFile("/proc/stat", "utf-8");

    const parse = (data: string): { idle: number; total: number } => {
      const line = data.split("\n")[0] ?? ""; // first line: cpu user nice system idle iowait irq softirq steal guest guest_nice
      // Only the first eight counters are summed: guest and guest_nice are
      // already included in user and nice, so adding them would count that
      // time twice.
      const parts = line.trim().split(/\s+/).slice(1, 9).map(Number);
      if (parts.length < 4 || parts.some((part) => !Number.isFinite(part))) {
        // Thrown on purpose so the outer catch switches to the load-average fallback.
        throw new Error("Unexpected /proc/stat format");
      }
      const idle = parts[3] + (parts[4] || 0); // idle + iowait
      const total = parts.reduce((a, b) => a + b, 0);
      return { idle, total };
    };

    const s1 = parse(stat1);
    const s2 = parse(stat2);
    const idleDiff = s2.idle - s1.idle;
    const totalDiff = s2.total - s1.total;

    // No elapsed ticks (or counters went backwards): nothing to measure.
    if (totalDiff <= 0) return 0;
    return toPercent(((totalDiff - idleDiff) / totalDiff) * 100);
  } catch {
    // Fallback if /proc/stat is unavailable or malformed
    const cpuCount = os.cpus().length || 1;
    return toPercent((os.loadavg()[0] / cpuCount) * 100);
  }
}
|
||||||
|
|
||||||
/**
 * Reports usage of the root filesystem based on `df -B1 /`.
 *
 * @returns Total and used space in GiB and the used share in percent, each
 *   rounded to one decimal. All three values are 0 when `df` fails, times out,
 *   or prints something that cannot be parsed into a positive total.
 */
function getDiskUsage(): { diskUsedGB: number; diskTotalGB: number; diskUsagePercent: number } {
  const unavailable = { diskUsedGB: 0, diskTotalGB: 0, diskUsagePercent: 0 };

  try {
    // Same 5 s ceiling as the other shell calls in this module, so a stuck
    // mount cannot block the caller indefinitely.
    const output = execSync("df -B1 / | tail -1", { encoding: "utf-8", timeout: 5000 });
    const parts = output.trim().split(/\s+/);
    // Format: filesystem 1B-blocks used available use% mountpoint
    const total = parseInt(parts[1], 10);
    const used = parseInt(parts[2], 10);

    // Guard against NaN / division by zero leaking into the metrics.
    if (!Number.isFinite(total) || !Number.isFinite(used) || total <= 0) {
      return unavailable;
    }

    return {
      diskTotalGB: roundToOneDecimal(total / 1024 / 1024 / 1024),
      diskUsedGB: roundToOneDecimal(used / 1024 / 1024 / 1024),
      diskUsagePercent: roundToOneDecimal((used / total) * 100),
    };
  } catch {
    return unavailable;
  }
}
|
||||||
|
|
||||||
/**
 * Derives the overall token status from expiry counters.
 *
 * Expired tokens take precedence over tokens that are merely expiring soon.
 *
 * @param counts - Number of expired and soon-to-expire tokens.
 * @returns `"expired"`, `"expiring_soon"` or `"ok"`.
 */
function getOAuthStatus(counts: { tokensExpired: number; tokensExpiringSoon: number }): OAuthTokenStatus["status"] {
  return counts.tokensExpired > 0 ? "expired" : counts.tokensExpiringSoon > 0 ? "expiring_soon" : "ok";
}
|
||||||
|
|
||||||
/** Result of a PM2 lookup: one status per process name this module looks for. */
interface Pm2Processes {
  /** Status of the PM2 process named `payload`. */
  payloadProcess: ProcessStatus;
  /** Status of the PM2 process named `queue-worker`. */
  queueWorkerProcess: ProcessStatus;
}
|
||||||
|
|
||||||
/**
 * Looks up the `payload` and `queue-worker` processes in `pm2 jlist`.
 *
 * @param defaultProcess - Status to report for a process that PM2 does not
 *   list, or for both processes when PM2 is unavailable or its output cannot
 *   be parsed.
 * @returns Status for both processes. Each entry is its own object, so the
 *   results never alias each other or `defaultProcess`. Never throws.
 */
function getPm2Processes(defaultProcess: ProcessStatus): Pm2Processes {
  let payloadProcess: ProcessStatus = { ...defaultProcess };
  let queueWorkerProcess: ProcessStatus = { ...defaultProcess };

  try {
    const pm2Out = execSync("pm2 jlist", {
      encoding: "utf-8",
      timeout: 5000,
      // stderr is discarded: without this, a host without PM2 prints a shell
      // "not found" message to the parent's stderr on every call.
      stdio: ["ignore", "pipe", "ignore"],
    });
    const parsed: unknown = JSON.parse(pm2Out);
    const pm2List = Array.isArray(parsed) ? (parsed as Array<Record<string, unknown>>) : [];

    for (const proc of pm2List) {
      const env = proc.pm2_env as Record<string, unknown> | undefined;
      const monit = proc.monit as Record<string, number> | undefined;

      // pm_uptime is used as the process start time in epoch milliseconds.
      const startedAt = typeof env?.pm_uptime === "number" ? env.pm_uptime : 0;

      const info: ProcessStatus = {
        status: env?.status === "online" ? "online" : "offline",
        pid: (proc.pid as number) || 0,
        memoryMB: Math.round((monit?.memory || 0) / 1024 / 1024),
        // Clamped so clock adjustments cannot produce a negative uptime.
        uptimeSeconds: startedAt ? Math.max(0, Math.round((Date.now() - startedAt) / 1000)) : 0,
        restarts: (env?.restart_time as number) || 0,
      };

      if (proc.name === "payload") {
        payloadProcess = info;
      } else if (proc.name === "queue-worker") {
        queueWorkerProcess = info;
      }
    }
  } catch {
    // PM2 not available
  }

  return { payloadProcess, queueWorkerProcess };
}
|
||||||
|
|
|
||||||
|
|
@ -7,14 +7,14 @@
|
||||||
* error rates, and throughput.
|
* error rates, and throughput.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import type { PerformanceEntry, PerformanceMetrics } from './types'
|
import type { PerformanceEntry, PerformanceMetrics } from "./types";
|
||||||
|
|
||||||
/** Length of each supported reporting period in milliseconds, keyed by its label. */
const PERIOD_MS: Record<string, number> = {
  "1h": 60 * 60 * 1000,
  "6h": 6 * 60 * 60 * 1000,
  "24h": 24 * 60 * 60 * 1000,
  "7d": 7 * 24 * 60 * 60 * 1000,
};
|
||||||
|
|
||||||
const EMPTY_METRICS: PerformanceMetrics = {
|
const EMPTY_METRICS: PerformanceMetrics = {
|
||||||
avgResponseTimeMs: 0,
|
avgResponseTimeMs: 0,
|
||||||
|
|
@ -22,17 +22,17 @@ const EMPTY_METRICS: PerformanceMetrics = {
|
||||||
p99ResponseTimeMs: 0,
|
p99ResponseTimeMs: 0,
|
||||||
errorRate: 0,
|
errorRate: 0,
|
||||||
requestsPerMinute: 0,
|
requestsPerMinute: 0,
|
||||||
}
|
};
|
||||||
|
|
||||||
export class PerformanceTracker {
|
export class PerformanceTracker {
|
||||||
private readonly buffer: PerformanceEntry[]
|
private readonly buffer: PerformanceEntry[];
|
||||||
private pointer: number = 0
|
private pointer: number = 0;
|
||||||
private count: number = 0
|
private count: number = 0;
|
||||||
private readonly capacity: number
|
private readonly capacity: number;
|
||||||
|
|
||||||
constructor(capacity: number = 10_000) {
|
constructor(capacity: number = 10_000) {
|
||||||
this.capacity = capacity
|
this.capacity = capacity;
|
||||||
this.buffer = new Array(capacity)
|
this.buffer = new Array(capacity);
|
||||||
}
|
}
|
||||||
|
|
||||||
track(method: string, path: string, statusCode: number, durationMs: number): void {
|
track(method: string, path: string, statusCode: number, durationMs: number): void {
|
||||||
|
|
@ -42,40 +42,40 @@ export class PerformanceTracker {
|
||||||
path,
|
path,
|
||||||
statusCode,
|
statusCode,
|
||||||
durationMs,
|
durationMs,
|
||||||
}
|
};
|
||||||
this.pointer = (this.pointer + 1) % this.capacity
|
this.pointer = (this.pointer + 1) % this.capacity;
|
||||||
if (this.count < this.capacity) {
|
if (this.count < this.capacity) {
|
||||||
this.count++
|
this.count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
getMetrics(period: '1h' | '6h' | '24h' | '7d' = '1h'): PerformanceMetrics {
|
getMetrics(period: "1h" | "6h" | "24h" | "7d" = "1h"): PerformanceMetrics {
|
||||||
const cutoff = Date.now() - (PERIOD_MS[period] ?? PERIOD_MS['1h'])
|
const cutoff = Date.now() - (PERIOD_MS[period] ?? PERIOD_MS["1h"]);
|
||||||
|
|
||||||
const entries: PerformanceEntry[] = []
|
const entries: PerformanceEntry[] = [];
|
||||||
for (let i = 0; i < this.count; i++) {
|
for (let i = 0; i < this.count; i++) {
|
||||||
const entry = this.buffer[i]
|
const entry = this.buffer[i];
|
||||||
if (entry && entry.timestamp >= cutoff) {
|
if (entry && entry.timestamp >= cutoff) {
|
||||||
entries.push(entry)
|
entries.push(entry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entries.length === 0) {
|
if (entries.length === 0) {
|
||||||
return { ...EMPTY_METRICS }
|
return { ...EMPTY_METRICS };
|
||||||
}
|
}
|
||||||
|
|
||||||
const durations = entries.map((e) => e.durationMs).sort((a, b) => a - b)
|
const durations = entries.map((e) => e.durationMs).sort((a, b) => a - b);
|
||||||
|
|
||||||
const avg = durations.reduce((sum, d) => sum + d, 0) / durations.length
|
const avg = durations.reduce((sum, d) => sum + d, 0) / durations.length;
|
||||||
const p95 = percentile(durations, 0.95)
|
const p95 = percentile(durations, 0.95);
|
||||||
const p99 = percentile(durations, 0.99)
|
const p99 = percentile(durations, 0.99);
|
||||||
|
|
||||||
const errorCount = entries.filter((e) => e.statusCode >= 500).length
|
const errorCount = entries.filter((e) => e.statusCode >= 500).length;
|
||||||
const errorRate = errorCount / entries.length
|
const errorRate = errorCount / entries.length;
|
||||||
|
|
||||||
const earliestTimestamp = Math.min(...entries.map((e) => e.timestamp))
|
const earliestTimestamp = Math.min(...entries.map((e) => e.timestamp));
|
||||||
const windowMinutes = Math.max((Date.now() - earliestTimestamp) / 60_000, 1)
|
const windowMinutes = Math.max((Date.now() - earliestTimestamp) / 60_000, 1);
|
||||||
const requestsPerMinute = entries.length / windowMinutes
|
const requestsPerMinute = entries.length / windowMinutes;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
avgResponseTimeMs: Math.round(avg),
|
avgResponseTimeMs: Math.round(avg),
|
||||||
|
|
@ -83,14 +83,14 @@ export class PerformanceTracker {
|
||||||
p99ResponseTimeMs: p99,
|
p99ResponseTimeMs: p99,
|
||||||
errorRate: Math.round(errorRate * 1000) / 1000,
|
errorRate: Math.round(errorRate * 1000) / 1000,
|
||||||
requestsPerMinute: Math.round(requestsPerMinute * 10) / 10,
|
requestsPerMinute: Math.round(requestsPerMinute * 10) / 10,
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Picks the value at fraction `p` of an ascending-sorted list.
 *
 * The element at index `floor(length * p)` is returned, with the index kept
 * inside the array bounds.
 *
 * @param sorted - Values in ascending order; the array is not modified.
 * @param p - Fraction of the list, normally between 0 and 1. Values outside
 *   that range select the first or last element; NaN selects the first.
 * @returns The selected value, or 0 for an empty list.
 */
function percentile(sorted: number[], p: number): number {
  if (sorted.length === 0) {
    // Without this guard the lookup below would yield undefined despite the number return type.
    return 0;
  }

  const lastIndex = sorted.length - 1;
  const rawIndex = Math.floor(sorted.length * p);
  const index = Number.isNaN(rawIndex) ? 0 : Math.min(Math.max(rawIndex, 0), lastIndex);
  return sorted[index];
}
|
||||||
|
|
||||||
/** Singleton instance used across the application. */
|
/** Singleton instance used across the application. */
|
||||||
export const performanceTracker = new PerformanceTracker(10_000)
|
export const performanceTracker = new PerformanceTracker(10_000);
|
||||||
|
|
|
||||||
|
|
@ -6,62 +6,62 @@
|
||||||
* sie in MonitoringSnapshots. Evaluiert dabei Alert-Regeln.
|
* sie in MonitoringSnapshots. Evaluiert dabei Alert-Regeln.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { collectMetrics } from './monitoring-service'
|
import { collectMetrics } from "./monitoring-service";
|
||||||
import { AlertEvaluator } from './alert-evaluator'
|
import { AlertEvaluator } from "./alert-evaluator";
|
||||||
|
|
||||||
let interval: ReturnType<typeof setInterval> | null = null
|
let interval: ReturnType<typeof setInterval> | null = null;
|
||||||
const alertEvaluator = new AlertEvaluator()
|
const alertEvaluator = new AlertEvaluator();
|
||||||
|
|
||||||
/** Cached Payload instance — resolved once, reused on every tick. */
|
/** Cached Payload instance — resolved once, reused on every tick. */
|
||||||
let cachedPayload: any = null
|
let cachedPayload: any = null;
|
||||||
|
|
||||||
/**
 * Returns the Payload instance held in `cachedPayload`, creating it on first
 * use. Both `payload` and the project config are imported lazily at that point.
 */
async function getPayloadInstance(): Promise<any> {
  if (!cachedPayload) {
    const { getPayload } = await import("payload");
    const configModule = await import(/* @vite-ignore */ "@payload-config");
    cachedPayload = await getPayload({ config: configModule.default });
  }
  return cachedPayload;
}
|
||||||
|
|
||||||
/**
 * Starts periodic snapshot collection.
 *
 * One snapshot is collected immediately; afterwards one is collected every
 * `MONITORING_SNAPSHOT_INTERVAL` milliseconds (default 60000). Calling this
 * again replaces the running timer instead of adding a second one.
 *
 * @returns Resolves once the initial collection has finished and the timer is armed.
 */
export async function startSnapshotCollector(): Promise<void> {
  const DEFAULT_INTERVAL_MS = 60_000;

  // A non-numeric, zero or negative setting would make setInterval fire
  // roughly every millisecond, so anything unusable falls back to the default.
  const configured = parseInt(process.env.MONITORING_SNAPSHOT_INTERVAL || "", 10);
  const INTERVAL = Number.isFinite(configured) && configured > 0 ? configured : DEFAULT_INTERVAL_MS;

  // Restarting must not leave the previous timer running.
  if (interval) {
    clearInterval(interval);
    interval = null;
  }

  console.log(`[SnapshotCollector] Starting (interval: ${INTERVAL}ms)`);

  // Run immediately once, then on interval
  await collectAndSave();

  // A tick is skipped while the previous collection is still in progress, so
  // slow checks cannot pile up overlapping runs.
  let collecting = false;
  interval = setInterval(() => {
    if (collecting) return;
    collecting = true;
    void collectAndSave().finally(() => {
      collecting = false;
    });
  }, INTERVAL);
}
|
||||||
|
|
||||||
/**
 * Runs one collection cycle: gathers metrics, stores them as a
 * `monitoring-snapshots` document and evaluates the alert rules against them.
 *
 * Failures are logged rather than propagated, and the cached Payload instance
 * is dropped so the next cycle resolves a fresh one.
 */
async function collectAndSave(): Promise<void> {
  try {
    const payload = await getPayloadInstance();
    const metrics = await collectMetrics();

    const snapshot = {
      timestamp: new Date().toISOString(),
      ...metrics,
    };
    await (payload as any).create({ collection: "monitoring-snapshots", data: snapshot });

    // Alert rules are checked against the metrics that were just stored.
    await alertEvaluator.evaluateRules(payload as any, metrics);
  } catch (error) {
    console.error("[SnapshotCollector] Error:", error);
    // Forget the cached instance so the next tick re-resolves it.
    cachedPayload = null;
  }
}
|
||||||
|
|
||||||
/** Stops periodic snapshot collection; safe to call when no timer is running. */
export async function stopSnapshotCollector(): Promise<void> {
  const activeTimer = interval;
  if (activeTimer) {
    clearInterval(activeTimer);
    interval = null;
  }
  console.log("[SnapshotCollector] Stopped");
}
|
||||||
|
|
|
||||||
|
|
@ -1,192 +1,185 @@
|
||||||
// === System Health ===
|
// === System Health ===
|
||||||
/**
 * Host-level resource figures.
 *
 * NOTE(review): units are presumably those named in each field's suffix
 * (MB, GB, Percent) — confirm against the code that fills these in.
 */
export interface SystemHealth {
  // CPU
  cpuUsagePercent: number;

  // Memory
  memoryUsedMB: number;
  memoryTotalMB: number;
  memoryUsagePercent: number;

  // Disk
  diskUsedGB: number;
  diskTotalGB: number;
  diskUsagePercent: number;

  // Load averages
  loadAvg1: number;
  loadAvg5: number;

  /** Uptime in seconds. */
  uptime: number;
}
|
||||||
|
|
||||||
// === Service Statuses ===
|
// === Service Statuses ===
|
||||||
export type ServiceStatusType = 'online' | 'warning' | 'offline'
|
export type ServiceStatusType = "online" | "warning" | "offline";
|
||||||
|
|
||||||
export interface ProcessStatus {
|
export interface ProcessStatus {
|
||||||
status: ServiceStatusType
|
status: ServiceStatusType;
|
||||||
pid: number
|
pid: number;
|
||||||
memoryMB: number
|
memoryMB: number;
|
||||||
uptimeSeconds: number
|
uptimeSeconds: number;
|
||||||
restarts: number
|
restarts: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface PostgresqlStatus {
|
export interface PostgresqlStatus {
|
||||||
status: ServiceStatusType
|
status: ServiceStatusType;
|
||||||
connections: number
|
connections: number;
|
||||||
maxConnections: number
|
maxConnections: number;
|
||||||
latencyMs: number
|
latencyMs: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface PgBouncerStatus {
|
export interface PgBouncerStatus {
|
||||||
status: ServiceStatusType
|
status: ServiceStatusType;
|
||||||
activeConnections: number
|
activeConnections: number;
|
||||||
waitingClients: number
|
waitingClients: number;
|
||||||
poolSize: number
|
poolSize: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface RedisStatus {
|
export interface RedisStatus {
|
||||||
status: ServiceStatusType
|
status: ServiceStatusType;
|
||||||
memoryUsedMB: number
|
memoryUsedMB: number;
|
||||||
connectedClients: number
|
connectedClients: number;
|
||||||
opsPerSec: number
|
opsPerSec: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ServiceStatuses {
|
export interface ServiceStatuses {
|
||||||
payload: ProcessStatus
|
payload: ProcessStatus;
|
||||||
queueWorker: ProcessStatus
|
queueWorker: ProcessStatus;
|
||||||
postgresql: PostgresqlStatus
|
postgresql: PostgresqlStatus;
|
||||||
pgbouncer: PgBouncerStatus
|
pgbouncer: PgBouncerStatus;
|
||||||
redis: RedisStatus
|
redis: RedisStatus;
|
||||||
}
|
}
|
||||||
|
|
||||||
// === External Statuses ===
|
// === External Statuses ===
|
||||||
export interface SmtpStatus {
|
export interface SmtpStatus {
|
||||||
status: ServiceStatusType
|
status: ServiceStatusType;
|
||||||
lastCheck: string // ISO date
|
lastCheck: string; // ISO date
|
||||||
responseTimeMs: number
|
responseTimeMs: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export type OAuthStatusType = 'ok' | 'expiring_soon' | 'expired' | 'error'
|
export type OAuthStatusType = "ok" | "expiring_soon" | "expired" | "error";
|
||||||
|
|
||||||
export interface OAuthTokenStatus {
|
export interface OAuthTokenStatus {
|
||||||
status: OAuthStatusType
|
status: OAuthStatusType;
|
||||||
tokensTotal: number
|
tokensTotal: number;
|
||||||
tokensExpiringSoon: number
|
tokensExpiringSoon: number;
|
||||||
tokensExpired: number
|
tokensExpired: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface CronJobStatus {
|
export interface CronJobStatus {
|
||||||
lastRun: string // ISO date
|
lastRun: string; // ISO date
|
||||||
status: 'ok' | 'failed' | 'unknown'
|
status: "ok" | "failed" | "unknown";
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface CronStatuses {
|
export interface CronStatuses {
|
||||||
communitySync: CronJobStatus
|
communitySync: CronJobStatus;
|
||||||
tokenRefresh: CronJobStatus
|
tokenRefresh: CronJobStatus;
|
||||||
youtubeSync: CronJobStatus
|
youtubeSync: CronJobStatus;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface SecretExpiringSoon {
|
export interface SecretExpiringSoon {
|
||||||
name: string
|
name: string;
|
||||||
expiresAt: string
|
expiresAt: string;
|
||||||
daysRemaining: number
|
daysRemaining: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface SecretExpired {
|
export interface SecretExpired {
|
||||||
name: string
|
name: string;
|
||||||
expiresAt: string
|
expiresAt: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface SecretRotationOverdue {
|
export interface SecretRotationOverdue {
|
||||||
name: string
|
name: string;
|
||||||
rotatedAt: string
|
rotatedAt: string;
|
||||||
ageDays: number
|
ageDays: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface SecretsHealthStatus {
|
export interface SecretsHealthStatus {
|
||||||
status: 'ok' | 'warning' | 'critical'
|
status: "ok" | "warning" | "critical";
|
||||||
checkedAt: string
|
checkedAt: string;
|
||||||
missing: string[]
|
missing: string[];
|
||||||
expiringSoon: SecretExpiringSoon[]
|
expiringSoon: SecretExpiringSoon[];
|
||||||
expired: SecretExpired[]
|
expired: SecretExpired[];
|
||||||
rotationOverdue: SecretRotationOverdue[]
|
rotationOverdue: SecretRotationOverdue[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface SecurityMetricsStatus {
|
export interface SecurityMetricsStatus {
|
||||||
windowMs: number
|
windowMs: number;
|
||||||
counters: Array<{
|
counters: Array<{
|
||||||
eventType: string
|
eventType: string;
|
||||||
count: number
|
count: number;
|
||||||
windowStart: string
|
windowStart: string;
|
||||||
}>
|
}>;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ExternalStatuses {
|
export interface ExternalStatuses {
|
||||||
smtp: SmtpStatus
|
smtp: SmtpStatus;
|
||||||
metaOAuth: OAuthTokenStatus
|
metaOAuth: OAuthTokenStatus;
|
||||||
youtubeOAuth: OAuthTokenStatus
|
youtubeOAuth: OAuthTokenStatus;
|
||||||
cronJobs: CronStatuses
|
cronJobs: CronStatuses;
|
||||||
secrets: SecretsHealthStatus
|
secrets: SecretsHealthStatus;
|
||||||
securityEvents: SecurityMetricsStatus
|
securityEvents: SecurityMetricsStatus;
|
||||||
}
|
}
|
||||||
|
|
||||||
// === Performance ===
|
// === Performance ===
|
||||||
export interface PerformanceMetrics {
|
export interface PerformanceMetrics {
|
||||||
avgResponseTimeMs: number
|
avgResponseTimeMs: number;
|
||||||
p95ResponseTimeMs: number
|
p95ResponseTimeMs: number;
|
||||||
p99ResponseTimeMs: number
|
p99ResponseTimeMs: number;
|
||||||
errorRate: number // 0-1
|
errorRate: number; // 0-1
|
||||||
requestsPerMinute: number
|
requestsPerMinute: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
// === Full Snapshot ===
|
// === Full Snapshot ===
|
||||||
export interface SystemMetrics {
|
export interface SystemMetrics {
|
||||||
timestamp: string // ISO date
|
timestamp: string; // ISO date
|
||||||
system: SystemHealth
|
system: SystemHealth;
|
||||||
services: ServiceStatuses
|
services: ServiceStatuses;
|
||||||
external: ExternalStatuses
|
external: ExternalStatuses;
|
||||||
performance: PerformanceMetrics
|
performance: PerformanceMetrics;
|
||||||
}
|
}
|
||||||
|
|
||||||
// === SSE Events (discriminated union) ===
|
// === SSE Events (discriminated union) ===
|
||||||
export type MonitoringEvent =
|
export type MonitoringEvent =
|
||||||
| { type: 'health'; data: SystemHealth }
|
| { type: "health"; data: SystemHealth }
|
||||||
| { type: 'service'; data: Partial<ServiceStatuses> }
|
| { type: "service"; data: Partial<ServiceStatuses> }
|
||||||
| { type: 'alert'; data: AlertEvent }
|
| { type: "alert"; data: AlertEvent }
|
||||||
| { type: 'log'; data: LogEvent }
|
| { type: "log"; data: LogEvent }
|
||||||
| { type: 'performance'; data: PerformanceMetrics }
|
| { type: "performance"; data: PerformanceMetrics };
|
||||||
|
|
||||||
export interface AlertEvent {
|
export interface AlertEvent {
|
||||||
id: string
|
id: string;
|
||||||
ruleId: string
|
ruleId: string;
|
||||||
metric: string
|
metric: string;
|
||||||
value: number
|
value: number;
|
||||||
threshold: number
|
threshold: number;
|
||||||
severity: AlertSeverity
|
severity: AlertSeverity;
|
||||||
message: string
|
message: string;
|
||||||
timestamp: string
|
timestamp: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface LogEvent {
|
export interface LogEvent {
|
||||||
id: string
|
id: string;
|
||||||
level: LogLevel
|
level: LogLevel;
|
||||||
source: LogSource
|
source: LogSource;
|
||||||
message: string
|
message: string;
|
||||||
timestamp: string
|
timestamp: string;
|
||||||
context?: Record<string, unknown>
|
context?: Record<string, unknown>;
|
||||||
}
|
}
|
||||||
|
|
||||||
// === Enums as union types ===
|
// === Enums as union types ===
|
||||||
export type AlertCondition = 'gt' | 'lt' | 'eq' | 'gte' | 'lte'
|
export type AlertCondition = "gt" | "lt" | "eq" | "gte" | "lte";
|
||||||
export type AlertSeverity = 'warning' | 'error' | 'critical'
|
export type AlertSeverity = "warning" | "error" | "critical";
|
||||||
export type LogLevel = 'debug' | 'info' | 'warn' | 'error' | 'fatal'
|
export type LogLevel = "debug" | "info" | "warn" | "error" | "fatal";
|
||||||
export type LogSource =
|
export type LogSource = "payload" | "queue-worker" | "cron" | "email" | "oauth" | "sync" | "security";
|
||||||
| 'payload'
|
|
||||||
| 'queue-worker'
|
|
||||||
| 'cron'
|
|
||||||
| 'email'
|
|
||||||
| 'oauth'
|
|
||||||
| 'sync'
|
|
||||||
| 'security'
|
|
||||||
|
|
||||||
// === Performance Tracker Entry ===
|
// === Performance Tracker Entry ===
|
||||||
export interface PerformanceEntry {
|
export interface PerformanceEntry {
|
||||||
timestamp: number // Date.now()
|
timestamp: number; // Date.now()
|
||||||
method: string
|
method: string;
|
||||||
path: string
|
path: string;
|
||||||
statusCode: number
|
statusCode: number;
|
||||||
durationMs: number
|
durationMs: number;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue