fix(ci): increase build heap size and format monitoring files

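The build was running out of memory (OOM) in CI under Node's default heap
limit, so the "Build application" step now sets
NODE_OPTIONS='--max-old-space-size=4096' (4 GB heap). Also ran Prettier on
the monitoring files; those changes are formatting-only.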

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Martin Porwoll 2026-02-17 11:58:08 +00:00
parent 884d33c0ae
commit 037835d1de
7 changed files with 481 additions and 510 deletions

View file

@ -212,6 +212,7 @@ jobs:
- name: Build application - name: Build application
run: pnpm build run: pnpm build
env: env:
NODE_OPTIONS: '--max-old-space-size=4096'
# Minimal env vars for build # Minimal env vars for build
PAYLOAD_SECRET: build-secret-placeholder PAYLOAD_SECRET: build-secret-placeholder
DATABASE_URI: postgresql://placeholder:placeholder@localhost:5432/placeholder DATABASE_URI: postgresql://placeholder:placeholder@localhost:5432/placeholder

View file

@ -6,8 +6,8 @@
* cooldown periods, and multi-channel alert dispatch. * cooldown periods, and multi-channel alert dispatch.
*/ */
import type { Payload } from 'payload' import type { Payload } from "payload";
import type { AlertCondition, AlertSeverity, SystemMetrics } from './types' import type { AlertCondition, AlertSeverity, SystemMetrics } from "./types";
// ============================================================================ // ============================================================================
// Pure Functions // Pure Functions
@ -18,40 +18,36 @@ import type { AlertCondition, AlertSeverity, SystemMetrics } from './types'
* Example: getMetricValue(metrics, 'system.cpuUsagePercent') => 92 * Example: getMetricValue(metrics, 'system.cpuUsagePercent') => 92
*/ */
export function getMetricValue(metrics: Record<string, unknown>, path: string): number | undefined { export function getMetricValue(metrics: Record<string, unknown>, path: string): number | undefined {
const parts = path.split('.') const parts = path.split(".");
let current: unknown = metrics let current: unknown = metrics;
for (const part of parts) { for (const part of parts) {
if (current === null || current === undefined || typeof current !== 'object') { if (current === null || current === undefined || typeof current !== "object") {
return undefined return undefined;
} }
current = (current as Record<string, unknown>)[part] current = (current as Record<string, unknown>)[part];
} }
return typeof current === 'number' ? current : undefined return typeof current === "number" ? current : undefined;
} }
/** /**
* Evaluates a condition against a value and threshold. * Evaluates a condition against a value and threshold.
*/ */
export function evaluateCondition( export function evaluateCondition(condition: AlertCondition, value: number, threshold: number): boolean {
condition: AlertCondition,
value: number,
threshold: number,
): boolean {
switch (condition) { switch (condition) {
case 'gt': case "gt":
return value > threshold return value > threshold;
case 'lt': case "lt":
return value < threshold return value < threshold;
case 'eq': case "eq":
return value === threshold return value === threshold;
case 'gte': case "gte":
return value >= threshold return value >= threshold;
case 'lte': case "lte":
return value <= threshold return value <= threshold;
default: default:
return false return false;
} }
} }
@ -60,28 +56,28 @@ export function evaluateCondition(
// ============================================================================ // ============================================================================
interface AlertRule { interface AlertRule {
id: number id: number;
name: string name: string;
metric: string metric: string;
condition: AlertCondition condition: AlertCondition;
threshold: number threshold: number;
severity: AlertSeverity severity: AlertSeverity;
channels: Array<'email' | 'slack' | 'discord'> channels: Array<"email" | "slack" | "discord">;
recipients?: { recipients?: {
emails?: Array<{ email: string }> emails?: Array<{ email: string }>;
slackWebhook?: string slackWebhook?: string;
discordWebhook?: string discordWebhook?: string;
} };
cooldownMinutes: number cooldownMinutes: number;
enabled: boolean enabled: boolean;
} }
// Maps AlertSeverity to the AlertLevel expected by alert-service // Maps AlertSeverity to the AlertLevel expected by alert-service
const SEVERITY_TO_LEVEL: Record<AlertSeverity, string> = { const SEVERITY_TO_LEVEL: Record<AlertSeverity, string> = {
warning: 'warning', warning: "warning",
error: 'error', error: "error",
critical: 'critical', critical: "critical",
} };
// ============================================================================ // ============================================================================
// AlertEvaluator Class // AlertEvaluator Class
@ -89,58 +85,52 @@ const SEVERITY_TO_LEVEL: Record<AlertSeverity, string> = {
export class AlertEvaluator { export class AlertEvaluator {
/** Tracks last fire time per rule to enforce cooldown */ /** Tracks last fire time per rule to enforce cooldown */
private cooldownMap: Map<string, number> = new Map() private cooldownMap: Map<string, number> = new Map();
/** /**
* Returns true if the rule should fire (not in cooldown). * Returns true if the rule should fire (not in cooldown).
*/ */
shouldFire(ruleId: string, cooldownMinutes: number): boolean { shouldFire(ruleId: string, cooldownMinutes: number): boolean {
const lastFired = this.cooldownMap.get(ruleId) const lastFired = this.cooldownMap.get(ruleId);
if (lastFired) { if (lastFired) {
const elapsedMinutes = (Date.now() - lastFired) / 60_000 const elapsedMinutes = (Date.now() - lastFired) / 60_000;
if (elapsedMinutes < cooldownMinutes) return false if (elapsedMinutes < cooldownMinutes) return false;
} }
return true return true;
} }
/** Record that a rule fired successfully. */ /** Record that a rule fired successfully. */
recordFired(ruleId: string): void { recordFired(ruleId: string): void {
this.cooldownMap.set(ruleId, Date.now()) this.cooldownMap.set(ruleId, Date.now());
} }
/** /**
* Evaluates all enabled rules against current metrics. * Evaluates all enabled rules against current metrics.
* Fires alerts for rules that match and are not in cooldown. * Fires alerts for rules that match and are not in cooldown.
*/ */
async evaluateRules( async evaluateRules(payload: Payload, metrics: Omit<SystemMetrics, "timestamp">): Promise<void> {
payload: Payload,
metrics: Omit<SystemMetrics, 'timestamp'>,
): Promise<void> {
try { try {
const rules = await payload.find({ const rules = await payload.find({
collection: 'monitoring-alert-rules', collection: "monitoring-alert-rules",
where: { enabled: { equals: true } }, where: { enabled: { equals: true } },
limit: 100, limit: 100,
}) });
for (const doc of rules.docs) { for (const doc of rules.docs) {
const rule = doc as unknown as AlertRule const rule = doc as unknown as AlertRule;
const value = getMetricValue( const value = getMetricValue(metrics as unknown as Record<string, unknown>, rule.metric);
metrics as unknown as Record<string, unknown>, if (value === undefined) continue;
rule.metric,
)
if (value === undefined) continue
if (evaluateCondition(rule.condition, value, rule.threshold)) { if (evaluateCondition(rule.condition, value, rule.threshold)) {
const ruleKey = String(rule.id) const ruleKey = String(rule.id);
if (this.shouldFire(ruleKey, rule.cooldownMinutes)) { if (this.shouldFire(ruleKey, rule.cooldownMinutes)) {
await this.dispatchAlert(payload, rule, value) await this.dispatchAlert(payload, rule, value);
this.recordFired(ruleKey) this.recordFired(ruleKey);
} }
} }
} }
} catch (error) { } catch (error) {
console.error('[AlertEvaluator] Error evaluating rules:', error) console.error("[AlertEvaluator] Error evaluating rules:", error);
} }
} }
@ -149,11 +139,11 @@ export class AlertEvaluator {
* via the existing alert service. * via the existing alert service.
*/ */
private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<void> { private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<void> {
const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})` const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`;
try { try {
await payload.create({ await payload.create({
collection: 'monitoring-alert-history', collection: "monitoring-alert-history",
data: { data: {
rule: rule.id, rule: rule.id,
metric: rule.metric, metric: rule.metric,
@ -163,13 +153,13 @@ export class AlertEvaluator {
message, message,
channelsSent: rule.channels, channelsSent: rule.channels,
}, },
}) });
// Try to send via existing alert service // Try to send via existing alert service
try { try {
const { sendAlert } = await import('../alerting/alert-service.js') const { sendAlert } = await import("../alerting/alert-service.js");
await sendAlert(payload, { await sendAlert(payload, {
level: SEVERITY_TO_LEVEL[rule.severity] as 'warning' | 'error' | 'critical', level: SEVERITY_TO_LEVEL[rule.severity] as "warning" | "error" | "critical",
title: `[${rule.severity.toUpperCase()}] ${rule.name}`, title: `[${rule.severity.toUpperCase()}] ${rule.name}`,
message, message,
details: { details: {
@ -178,13 +168,13 @@ export class AlertEvaluator {
threshold: rule.threshold, threshold: rule.threshold,
condition: rule.condition, condition: rule.condition,
}, },
}) });
} catch { } catch {
// Alert service not available, history record is sufficient // Alert service not available, history record is sufficient
console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`) console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`);
} }
} catch (error) { } catch (error) {
console.error('[AlertEvaluator] Error dispatching alert:', error) console.error("[AlertEvaluator] Error dispatching alert:", error);
} }
} }
} }

View file

@ -5,7 +5,7 @@
* Falls back to console output when Payload is not yet initialized. * Falls back to console output when Payload is not yet initialized.
*/ */
import type { LogLevel, LogSource } from './types' import type { LogLevel, LogSource } from "./types";
const LOG_LEVELS: Record<LogLevel, number> = { const LOG_LEVELS: Record<LogLevel, number> = {
debug: 0, debug: 0,
@ -13,63 +13,58 @@ const LOG_LEVELS: Record<LogLevel, number> = {
warn: 2, warn: 2,
error: 3, error: 3,
fatal: 4, fatal: 4,
} };
function getMinLevel(): LogLevel { function getMinLevel(): LogLevel {
return (process.env.MONITORING_LOG_LEVEL as LogLevel) || 'info' return (process.env.MONITORING_LOG_LEVEL as LogLevel) || "info";
} }
function shouldLog(level: LogLevel): boolean { function shouldLog(level: LogLevel): boolean {
return LOG_LEVELS[level] >= LOG_LEVELS[getMinLevel()] return LOG_LEVELS[level] >= LOG_LEVELS[getMinLevel()];
} }
export interface LogContext { export interface LogContext {
requestId?: string requestId?: string;
userId?: number userId?: number;
tenant?: number tenant?: number;
duration?: number duration?: number;
[key: string]: unknown [key: string]: unknown;
} }
export interface MonitoringLoggerInstance { export interface MonitoringLoggerInstance {
debug(message: string, context?: LogContext): void debug(message: string, context?: LogContext): void;
info(message: string, context?: LogContext): void info(message: string, context?: LogContext): void;
warn(message: string, context?: LogContext): void warn(message: string, context?: LogContext): void;
error(message: string, context?: LogContext): void error(message: string, context?: LogContext): void;
fatal(message: string, context?: LogContext): void fatal(message: string, context?: LogContext): void;
} }
/** Cached Payload instance — resolved once, reused for all subsequent writes. */ /** Cached Payload instance — resolved once, reused for all subsequent writes. */
let cachedPayload: any = null let cachedPayload: any = null;
async function getPayloadInstance(): Promise<any> { async function getPayloadInstance(): Promise<any> {
if (cachedPayload) return cachedPayload if (cachedPayload) return cachedPayload;
const { getPayload } = await import('payload') const { getPayload } = await import("payload");
const config = (await import(/* @vite-ignore */ '@payload-config')).default const config = (await import(/* @vite-ignore */ "@payload-config")).default;
cachedPayload = await getPayload({ config }) cachedPayload = await getPayload({ config });
return cachedPayload return cachedPayload;
} }
/** Reset cached instance (used in tests). */ /** Reset cached instance (used in tests). */
export function _resetPayloadCache(): void { export function _resetPayloadCache(): void {
cachedPayload = null cachedPayload = null;
} }
async function writeLog( async function writeLog(source: LogSource, level: LogLevel, message: string, context?: LogContext): Promise<void> {
source: LogSource, if (!shouldLog(level)) return;
level: LogLevel,
message: string,
context?: LogContext,
): Promise<void> {
if (!shouldLog(level)) return
try { try {
const payload = await getPayloadInstance() const payload = await getPayloadInstance();
const { requestId, userId, tenant, duration, ...rest } = context || {} const { requestId, userId, tenant, duration, ...rest } = context || {};
await payload.create({ await payload.create({
collection: 'monitoring-logs', collection: "monitoring-logs",
data: { data: {
level, level,
source, source,
@ -80,12 +75,12 @@ async function writeLog(
tenant, tenant,
duration, duration,
}, },
}) });
} catch { } catch {
// Fallback to console if Payload is not yet initialized // Fallback to console if Payload is not yet initialized
cachedPayload = null cachedPayload = null;
const prefix = `[${source}][${level.toUpperCase()}]` const prefix = `[${source}][${level.toUpperCase()}]`;
console.log(prefix, message, context || '') console.log(prefix, message, context || "");
} }
} }
@ -94,16 +89,16 @@ export function createMonitoringLogger(source: LogSource): MonitoringLoggerInsta
return function logMessage(message: string, context?: LogContext): void { return function logMessage(message: string, context?: LogContext): void {
// Fire-and-forget -- don't block the caller // Fire-and-forget -- don't block the caller
writeLog(source, level, message, context).catch(function onError(err) { writeLog(source, level, message, context).catch(function onError(err) {
console.error(`[MonitoringLogger] Failed to write ${level} log:`, err) console.error(`[MonitoringLogger] Failed to write ${level} log:`, err);
}) });
} };
} }
return { return {
debug: log('debug'), debug: log("debug"),
info: log('info'), info: log("info"),
warn: log('warn'), warn: log("warn"),
error: log('error'), error: log("error"),
fatal: log('fatal'), fatal: log("fatal"),
} };
} }

View file

@ -5,8 +5,8 @@
* dependency checks. Used by the monitoring dashboard and snapshot collector. * dependency checks. Used by the monitoring dashboard and snapshot collector.
*/ */
import os from 'node:os' import os from "node:os";
import { execSync } from 'node:child_process' import { execSync } from "node:child_process";
import type { import type {
SystemHealth, SystemHealth,
ProcessStatus, ProcessStatus,
@ -21,9 +21,9 @@ import type {
SecurityMetricsStatus, SecurityMetricsStatus,
PerformanceMetrics, PerformanceMetrics,
SystemMetrics, SystemMetrics,
} from './types' } from "./types";
import { checkSecretsHealth } from '../security/secrets-health' import { checkSecretsHealth } from "../security/secrets-health";
import { getSecurityMetricsSnapshot } from '../security/security-observability' import { getSecurityMetricsSnapshot } from "../security/security-observability";
// ============================================================================ // ============================================================================
// System Health // System Health
@ -34,15 +34,15 @@ import { getSecurityMetricsSnapshot } from '../security/security-observability'
* CPU usage is calculated by sampling /proc/stat twice with 100ms delay. * CPU usage is calculated by sampling /proc/stat twice with 100ms delay.
*/ */
export async function checkSystemHealth(): Promise<SystemHealth> { export async function checkSystemHealth(): Promise<SystemHealth> {
const cpuUsagePercent = await getCpuUsage() const cpuUsagePercent = await getCpuUsage();
const memoryTotalMB = Math.round(os.totalmem() / 1024 / 1024) const memoryTotalMB = Math.round(os.totalmem() / 1024 / 1024);
const memoryUsedMB = Math.round((os.totalmem() - os.freemem()) / 1024 / 1024) const memoryUsedMB = Math.round((os.totalmem() - os.freemem()) / 1024 / 1024);
const memoryUsagePercent = roundToOneDecimal((memoryUsedMB / memoryTotalMB) * 100) const memoryUsagePercent = roundToOneDecimal((memoryUsedMB / memoryTotalMB) * 100);
const { diskUsedGB, diskTotalGB, diskUsagePercent } = getDiskUsage() const { diskUsedGB, diskTotalGB, diskUsagePercent } = getDiskUsage();
const [loadAvg1, loadAvg5] = os.loadavg() const [loadAvg1, loadAvg5] = os.loadavg();
return { return {
cpuUsagePercent: roundToOneDecimal(cpuUsagePercent), cpuUsagePercent: roundToOneDecimal(cpuUsagePercent),
@ -55,7 +55,7 @@ export async function checkSystemHealth(): Promise<SystemHealth> {
loadAvg1: roundToTwoDecimals(loadAvg1), loadAvg1: roundToTwoDecimals(loadAvg1),
loadAvg5: roundToTwoDecimals(loadAvg5), loadAvg5: roundToTwoDecimals(loadAvg5),
uptime: Math.round(os.uptime()), uptime: Math.round(os.uptime()),
} };
} }
// ============================================================================ // ============================================================================
@ -64,214 +64,212 @@ export async function checkSystemHealth(): Promise<SystemHealth> {
export async function checkRedis(): Promise<RedisStatus> { export async function checkRedis(): Promise<RedisStatus> {
const offlineStatus: RedisStatus = { const offlineStatus: RedisStatus = {
status: 'offline', status: "offline",
memoryUsedMB: 0, memoryUsedMB: 0,
connectedClients: 0, connectedClients: 0,
opsPerSec: 0, opsPerSec: 0,
} };
try { try {
const { getRedisClient } = await import('../redis.js') const { getRedisClient } = await import("../redis.js");
const client = getRedisClient() const client = getRedisClient();
if (!client) return offlineStatus if (!client) return offlineStatus;
const info = await client.info() const info = await client.info();
const getVal = (key: string): number => { const getVal = (key: string): number => {
const match = info.match(new RegExp(`${key}:(\\d+)`)) const match = info.match(new RegExp(`${key}:(\\d+)`));
return match ? parseInt(match[1], 10) : 0 return match ? parseInt(match[1], 10) : 0;
} };
return { return {
status: 'online', status: "online",
memoryUsedMB: Math.round(getVal('used_memory') / 1024 / 1024), memoryUsedMB: Math.round(getVal("used_memory") / 1024 / 1024),
connectedClients: getVal('connected_clients'), connectedClients: getVal("connected_clients"),
opsPerSec: getVal('instantaneous_ops_per_sec'), opsPerSec: getVal("instantaneous_ops_per_sec"),
} };
} catch { } catch {
return offlineStatus return offlineStatus;
} }
} }
export async function checkPostgresql(): Promise<PostgresqlStatus> { export async function checkPostgresql(): Promise<PostgresqlStatus> {
const offlineStatus: PostgresqlStatus = { const offlineStatus: PostgresqlStatus = {
status: 'offline', status: "offline",
connections: 0, connections: 0,
maxConnections: 50, maxConnections: 50,
latencyMs: -1, latencyMs: -1,
} };
try { try {
const { getPayload } = await import('payload') const { getPayload } = await import("payload");
const payload = await getPayload({ config: (await import('@payload-config')).default }) const payload = await getPayload({ config: (await import("@payload-config")).default });
const start = Date.now() const start = Date.now();
await payload.find({ collection: 'users', limit: 0 }) await payload.find({ collection: "users", limit: 0 });
const latencyMs = Date.now() - start const latencyMs = Date.now() - start;
let connections = 0 let connections = 0;
let maxConnections = 50 let maxConnections = 50;
try { try {
const connResult = runPsql( const connResult = runPsql(
'-h 10.10.181.101 -U payload -d payload_db -t -c "SELECT count(*) FROM pg_stat_activity WHERE datname = \'payload_db\'"', "-h 10.10.181.101 -U payload -d payload_db -t -c \"SELECT count(*) FROM pg_stat_activity WHERE datname = 'payload_db'\"",
) );
connections = parseInt(connResult.trim(), 10) || 0 connections = parseInt(connResult.trim(), 10) || 0;
const maxResult = runPsql( const maxResult = runPsql('-h 10.10.181.101 -U payload -d payload_db -t -c "SHOW max_connections"');
'-h 10.10.181.101 -U payload -d payload_db -t -c "SHOW max_connections"', maxConnections = parseInt(maxResult.trim(), 10) || 50;
)
maxConnections = parseInt(maxResult.trim(), 10) || 50
} catch { } catch {
// psql unavailable -- latency check already proves connectivity // psql unavailable -- latency check already proves connectivity
} }
return { return {
status: latencyMs < 1000 ? 'online' : 'warning', status: latencyMs < 1000 ? "online" : "warning",
connections, connections,
maxConnections, maxConnections,
latencyMs, latencyMs,
} };
} catch { } catch {
return offlineStatus return offlineStatus;
} }
} }
export async function checkPgBouncer(): Promise<PgBouncerStatus> { export async function checkPgBouncer(): Promise<PgBouncerStatus> {
const offlineStatus: PgBouncerStatus = { const offlineStatus: PgBouncerStatus = {
status: 'offline', status: "offline",
activeConnections: 0, activeConnections: 0,
waitingClients: 0, waitingClients: 0,
poolSize: 0, poolSize: 0,
} };
try { try {
const output = runPsql('-h 127.0.0.1 -p 6432 -U payload -d pgbouncer -t -c "SHOW POOLS"') const output = runPsql('-h 127.0.0.1 -p 6432 -U payload -d pgbouncer -t -c "SHOW POOLS"');
// SHOW POOLS columns: database | user | cl_active | cl_waiting | sv_active | sv_idle | pool_size | ... // SHOW POOLS columns: database | user | cl_active | cl_waiting | sv_active | sv_idle | pool_size | ...
const lines = output const lines = output
.trim() .trim()
.split('\n') .split("\n")
.filter((l) => l.includes('payload')) .filter((l) => l.includes("payload"));
let activeConnections = 0 let activeConnections = 0;
let waitingClients = 0 let waitingClients = 0;
let poolSize = 20 let poolSize = 20;
for (const line of lines) { for (const line of lines) {
const parts = line.split('|').map((s) => s.trim()) const parts = line.split("|").map((s) => s.trim());
activeConnections += parseInt(parts[2], 10) || 0 activeConnections += parseInt(parts[2], 10) || 0;
waitingClients += parseInt(parts[3], 10) || 0 waitingClients += parseInt(parts[3], 10) || 0;
poolSize = parseInt(parts[6], 10) || 20 poolSize = parseInt(parts[6], 10) || 20;
} }
return { status: 'online', activeConnections, waitingClients, poolSize } return { status: "online", activeConnections, waitingClients, poolSize };
} catch { } catch {
return offlineStatus return offlineStatus;
} }
} }
export interface QueueCounts { export interface QueueCounts {
waiting: number waiting: number;
active: number active: number;
completed: number completed: number;
failed: number failed: number;
} }
export async function checkQueues(): Promise<Record<string, QueueCounts>> { export async function checkQueues(): Promise<Record<string, QueueCounts>> {
try { try {
const { Queue } = await import('bullmq') const { Queue } = await import("bullmq");
const { getQueueRedisConnection } = await import('../queue/queue-service.js') const { getQueueRedisConnection } = await import("../queue/queue-service.js");
const connection = getQueueRedisConnection() const connection = getQueueRedisConnection();
// Queue names matching QUEUE_NAMES in queue-service.ts // Queue names matching QUEUE_NAMES in queue-service.ts
const queueNames = ['email', 'pdf', 'cleanup', 'youtube-upload'] const queueNames = ["email", "pdf", "cleanup", "youtube-upload"];
const results: Record<string, QueueCounts> = {} const results: Record<string, QueueCounts> = {};
for (const name of queueNames) { for (const name of queueNames) {
try { try {
const queue = new Queue(name, { connection }) const queue = new Queue(name, { connection });
const counts = await queue.getJobCounts() const counts = await queue.getJobCounts();
results[name] = { results[name] = {
waiting: counts.waiting || 0, waiting: counts.waiting || 0,
active: counts.active || 0, active: counts.active || 0,
completed: counts.completed || 0, completed: counts.completed || 0,
failed: counts.failed || 0, failed: counts.failed || 0,
} };
await queue.close() await queue.close();
} catch { } catch {
results[name] = { waiting: 0, active: 0, completed: 0, failed: 0 } results[name] = { waiting: 0, active: 0, completed: 0, failed: 0 };
} }
} }
return results return results;
} catch { } catch {
return {} return {};
} }
} }
export async function checkSmtp(): Promise<SmtpStatus> { export async function checkSmtp(): Promise<SmtpStatus> {
const now = new Date().toISOString() const now = new Date().toISOString();
try { try {
const nodemailer = await import('nodemailer') const nodemailer = await import("nodemailer");
const transporter = nodemailer.createTransport({ const transporter = nodemailer.createTransport({
host: process.env.SMTP_HOST, host: process.env.SMTP_HOST,
port: parseInt(process.env.SMTP_PORT || '587', 10), port: parseInt(process.env.SMTP_PORT || "587", 10),
secure: process.env.SMTP_SECURE === 'true', secure: process.env.SMTP_SECURE === "true",
auth: { auth: {
user: process.env.SMTP_USER, user: process.env.SMTP_USER,
pass: process.env.SMTP_PASS, pass: process.env.SMTP_PASS,
}, },
}) });
const start = Date.now() const start = Date.now();
await transporter.verify() await transporter.verify();
const responseTimeMs = Date.now() - start const responseTimeMs = Date.now() - start;
return { status: 'online', lastCheck: now, responseTimeMs } return { status: "online", lastCheck: now, responseTimeMs };
} catch { } catch {
return { status: 'offline', lastCheck: now, responseTimeMs: -1 } return { status: "offline", lastCheck: now, responseTimeMs: -1 };
} }
} }
export async function checkOAuthTokens(): Promise<{ export async function checkOAuthTokens(): Promise<{
metaOAuth: OAuthTokenStatus metaOAuth: OAuthTokenStatus;
youtubeOAuth: OAuthTokenStatus youtubeOAuth: OAuthTokenStatus;
}> { }> {
const errorStatus: OAuthTokenStatus = { const errorStatus: OAuthTokenStatus = {
status: 'error', status: "error",
tokensTotal: 0, tokensTotal: 0,
tokensExpiringSoon: 0, tokensExpiringSoon: 0,
tokensExpired: 0, tokensExpired: 0,
} };
try { try {
const { getPayload } = await import('payload') const { getPayload } = await import("payload");
const payload = await getPayload({ config: (await import('@payload-config')).default }) const payload = await getPayload({ config: (await import("@payload-config")).default });
const accounts = await payload.find({ const accounts = await payload.find({
collection: 'social-accounts', collection: "social-accounts",
limit: 100, limit: 100,
where: { status: { equals: 'connected' } }, where: { status: { equals: "connected" } },
}) });
const sevenDaysFromNow = new Date() const sevenDaysFromNow = new Date();
sevenDaysFromNow.setDate(sevenDaysFromNow.getDate() + 7) sevenDaysFromNow.setDate(sevenDaysFromNow.getDate() + 7);
const now = new Date() const now = new Date();
const meta = { tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 } const meta = { tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 };
const youtube = { tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 } const youtube = { tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 };
for (const account of accounts.docs) { for (const account of accounts.docs) {
const doc = account as unknown as Record<string, unknown> const doc = account as unknown as Record<string, unknown>;
const target = doc.platform === 'youtube' ? youtube : meta const target = doc.platform === "youtube" ? youtube : meta;
target.tokensTotal++ target.tokensTotal++;
const expiresAt = doc.tokenExpiresAt ? new Date(doc.tokenExpiresAt as string) : null const expiresAt = doc.tokenExpiresAt ? new Date(doc.tokenExpiresAt as string) : null;
if (expiresAt) { if (expiresAt) {
if (expiresAt < now) { if (expiresAt < now) {
target.tokensExpired++ target.tokensExpired++;
} else if (expiresAt < sevenDaysFromNow) { } else if (expiresAt < sevenDaysFromNow) {
target.tokensExpiringSoon++ target.tokensExpiringSoon++;
} }
} }
} }
@ -279,58 +277,55 @@ export async function checkOAuthTokens(): Promise<{
return { return {
metaOAuth: { status: getOAuthStatus(meta), ...meta }, metaOAuth: { status: getOAuthStatus(meta), ...meta },
youtubeOAuth: { status: getOAuthStatus(youtube), ...youtube }, youtubeOAuth: { status: getOAuthStatus(youtube), ...youtube },
} };
} catch { } catch {
return { metaOAuth: errorStatus, youtubeOAuth: errorStatus } return { metaOAuth: errorStatus, youtubeOAuth: errorStatus };
} }
} }
export async function checkCronJobs(): Promise<CronStatuses> { export async function checkCronJobs(): Promise<CronStatuses> {
const unknownStatus: CronJobStatus = { lastRun: '', status: 'unknown' } const unknownStatus: CronJobStatus = { lastRun: "", status: "unknown" };
try { try {
const { getPayload } = await import('payload') const { getPayload } = await import("payload");
const payload = await getPayload({ config: (await import('@payload-config')).default }) const payload = await getPayload({ config: (await import("@payload-config")).default });
async function checkCron(source: string): Promise<CronJobStatus> { async function checkCron(source: string): Promise<CronJobStatus> {
try { try {
const logs = await payload.find({ const logs = await payload.find({
collection: 'monitoring-logs', collection: "monitoring-logs",
limit: 1, limit: 1,
sort: '-createdAt', sort: "-createdAt",
where: { where: {
and: [ and: [{ source: { equals: "cron" } }, { message: { contains: source } }],
{ source: { equals: 'cron' } },
{ message: { contains: source } },
],
}, },
}) });
if (logs.docs.length === 0) return unknownStatus if (logs.docs.length === 0) return unknownStatus;
const doc = logs.docs[0] as unknown as Record<string, unknown> const doc = logs.docs[0] as unknown as Record<string, unknown>;
return { return {
lastRun: doc.createdAt as string, lastRun: doc.createdAt as string,
status: doc.level === 'error' ? 'failed' : 'ok', status: doc.level === "error" ? "failed" : "ok",
} };
} catch { } catch {
return unknownStatus return unknownStatus;
} }
} }
const [communitySync, tokenRefresh, youtubeSync] = await Promise.all([ const [communitySync, tokenRefresh, youtubeSync] = await Promise.all([
checkCron('community-sync'), checkCron("community-sync"),
checkCron('token-refresh'), checkCron("token-refresh"),
checkCron('youtube'), checkCron("youtube"),
]) ]);
return { communitySync, tokenRefresh, youtubeSync } return { communitySync, tokenRefresh, youtubeSync };
} catch { } catch {
return { return {
communitySync: unknownStatus, communitySync: unknownStatus,
tokenRefresh: unknownStatus, tokenRefresh: unknownStatus,
youtubeSync: unknownStatus, youtubeSync: unknownStatus,
} };
} }
} }
@ -342,18 +337,19 @@ export async function checkCronJobs(): Promise<CronStatuses> {
/**
 * Collects all monitoring metrics in parallel. Individual check failures
 * are isolated and return safe defaults instead of failing the whole collection.
 *
 * @returns A complete metrics snapshot without the `timestamp` field.
 */
export async function collectMetrics(): Promise<Omit<SystemMetrics, "timestamp">> {
  const [system, redis, postgresql, pgbouncer, smtp, oauth, cronJobs, secrets, securityEvents] =
    await Promise.allSettled([
      checkSystemHealth(),
      checkRedis(),
      checkPostgresql(),
      checkPgBouncer(),
      checkSmtp(),
      checkOAuthTokens(),
      checkCronJobs(),
      // These two checks are invoked inside a promise callback so that a
      // synchronous throw becomes a rejected entry (and thus a fallback value)
      // instead of aborting the whole collection before allSettled runs.
      Promise.resolve().then(() => checkSecretsHealth()),
      Promise.resolve().then(() => getSecurityMetricsSnapshot()),
    ]);

  // Load performance tracker lazily to avoid circular dependencies
  let performance: PerformanceMetrics = {
    avgResponseTimeMs: 0,
    p95ResponseTimeMs: 0,
    p99ResponseTimeMs: 0,
    errorRate: 0,
    requestsPerMinute: 0,
  };

  try {
    // Dynamic path constructed at runtime to avoid Vite static analysis
    // when performance-tracker module has not been created yet
    const trackerPath = "./performance-tracker";
    const mod = await import(/* @vite-ignore */ trackerPath);
    performance = mod.performanceTracker.getMetrics("1h");
  } catch {
    // Performance tracker not yet initialized; keep the zeroed defaults.
  }

  // Reported for any PM2 process that is not found or when PM2 is unavailable.
  const defaultProcess: ProcessStatus = {
    status: "offline",
    pid: 0,
    memoryMB: 0,
    uptimeSeconds: 0,
    restarts: 0,
  };

  const { payloadProcess, queueWorkerProcess } = getPm2Processes(defaultProcess);

  const oauthDefaults = {
    metaOAuth: { status: "error" as const, tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 },
    youtubeOAuth: { status: "error" as const, tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 },
  };

  const cronDefaults: CronStatuses = {
    communitySync: { lastRun: "", status: "unknown" },
    tokenRefresh: { lastRun: "", status: "unknown" },
    youtubeSync: { lastRun: "", status: "unknown" },
  };

  // A failed secrets check is reported as "critical" rather than silently "ok".
  const secretsDefaults: SecretsHealthStatus = {
    status: "critical",
    checkedAt: new Date().toISOString(),
    missing: [],
    expiringSoon: [],
    expired: [],
    rotationOverdue: [],
  };

  const securityEventsDefaults: SecurityMetricsStatus = {
    windowMs: 300000,
    counters: [],
  };

  const systemDefaults: SystemHealth = {
    cpuUsagePercent: 0,
    memoryUsedMB: 0,
    memoryTotalMB: 0,
    memoryUsagePercent: 0,
    diskUsedGB: 0,
    diskTotalGB: 0,
    diskUsagePercent: 0,
    loadAvg1: 0,
    loadAvg5: 0,
    uptime: 0,
  };

  const oauthResult = settled(oauth, oauthDefaults);

  return {
    system: settled(system, systemDefaults),
    services: {
      payload: payloadProcess,
      queueWorker: queueWorkerProcess,
      postgresql: settled(postgresql, { status: "offline", connections: 0, maxConnections: 50, latencyMs: -1 }),
      pgbouncer: settled(pgbouncer, { status: "offline", activeConnections: 0, waitingClients: 0, poolSize: 0 }),
      redis: settled(redis, { status: "offline", memoryUsedMB: 0, connectedClients: 0, opsPerSec: 0 }),
    },
    external: {
      smtp: settled(smtp, { status: "offline", lastCheck: new Date().toISOString(), responseTimeMs: -1 }),
      metaOAuth: oauthResult.metaOAuth,
      youtubeOAuth: oauthResult.youtubeOAuth,
      cronJobs: settled(cronJobs, cronDefaults),
      secrets: settled(secrets, secretsDefaults),
      securityEvents: settled(securityEvents, securityEventsDefaults),
    },
    performance,
  };
}
// ============================================================================ // ============================================================================
@ -454,18 +450,18 @@ export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>
/**
 * Runs `psql` synchronously with the given argument string and returns its stdout.
 *
 * The password is handed over through the `PGPASSWORD` environment variable
 * (taken from `DB_PASSWORD`, empty string when unset). The call is aborted
 * after 5 seconds.
 *
 * NOTE(review): `args` is interpolated into a shell command line unescaped.
 * Only pass trusted, internally constructed arguments — confirm that no caller
 * forwards user-controlled input.
 *
 * @param args - Raw command-line arguments appended after `psql`.
 * @returns The command's stdout decoded as UTF-8.
 * @throws If the command cannot be run, exits non-zero, or times out.
 */
function runPsql(args: string): string {
  return execSync(`psql ${args}`, {
    encoding: "utf-8",
    timeout: 5000,
    env: { ...process.env, PGPASSWORD: process.env.DB_PASSWORD ?? "" },
  });
}
/**
 * Rounds a number to one decimal place using `Math.round`.
 *
 * @param value - The number to round.
 * @returns `value` rounded to the nearest tenth.
 */
function roundToOneDecimal(value: number): number {
  return Math.round(value * 10) / 10;
}
/**
 * Rounds a number to two decimal places using `Math.round`.
 *
 * @param value - The number to round.
 * @returns `value` rounded to the nearest hundredth.
 */
function roundToTwoDecimals(value: number): number {
  return Math.round(value * 100) / 100;
}
/**
 * Unwraps one entry of a `Promise.allSettled` result: returns the fulfilled
 * value, or the fallback when the promise was rejected.
 *
 * @param result - A single settled result.
 * @param fallback - Value to use when `result` is rejected.
 * @returns The fulfilled value or `fallback`.
 */
function settled<T>(result: PromiseSettledResult<T>, fallback: T): T {
  return result.status === "fulfilled" ? result.value : fallback;
}
/**
 * Estimates overall CPU utilisation as a percentage.
 *
 * Samples the aggregate `cpu` line of `/proc/stat` twice, 100 ms apart, and
 * reports the share of non-idle time in between (idle and iowait both count
 * as idle). When `/proc/stat` cannot be read or does not have the expected
 * shape, falls back to the 1-minute load average divided by the CPU count.
 *
 * @returns CPU usage in percent, always a finite number within 0–100.
 */
async function getCpuUsage(): Promise<number> {
  // Keeps the result a valid percentage even for odd inputs (NaN, Infinity, load > cores).
  const toPercent = (ratio: number): number => {
    const percent = ratio * 100;
    return Number.isFinite(percent) ? Math.min(100, Math.max(0, percent)) : 0;
  };

  const parseCpuTimes = (data: string): { idle: number; total: number } => {
    // First line: cpu user nice system idle iowait ...
    const [firstLine = ""] = data.split("\n");
    const fields = firstLine.trim().split(/\s+/).slice(1).map(Number);
    if (fields.length < 4 || fields.some((field) => !Number.isFinite(field))) {
      // Handled by the catch below, which switches to the load-average estimate.
      throw new Error("Unexpected /proc/stat format");
    }
    const idle = (fields[3] ?? 0) + (fields[4] ?? 0); // idle + iowait
    const total = fields.reduce((sum, field) => sum + field, 0);
    return { idle, total };
  };

  try {
    const fs = await import("node:fs/promises");
    const first = parseCpuTimes(await fs.readFile("/proc/stat", "utf-8"));
    await new Promise((resolve) => setTimeout(resolve, 100));
    const second = parseCpuTimes(await fs.readFile("/proc/stat", "utf-8"));

    const idleDiff = second.idle - first.idle;
    const totalDiff = second.total - first.total;

    // No elapsed ticks between samples: nothing to measure.
    if (totalDiff <= 0) return 0;
    return toPercent((totalDiff - idleDiff) / totalDiff);
  } catch {
    // Fallback if /proc/stat is unavailable
    const cpuCount = Math.max(os.cpus().length, 1);
    return toPercent((os.loadavg()[0] ?? 0) / cpuCount);
  }
}
/**
 * Reads usage of the root filesystem by running `df -B1 /`.
 *
 * @returns Used and total size in GiB (one decimal) and the usage percentage
 *   (one decimal). All three values are 0 when `df` fails, times out, or its
 *   output cannot be parsed.
 */
function getDiskUsage(): { diskUsedGB: number; diskTotalGB: number; diskUsagePercent: number } {
  const bytesPerGB = 1024 * 1024 * 1024;
  const unavailable = { diskUsedGB: 0, diskTotalGB: 0, diskUsagePercent: 0 };

  try {
    // Same 5 s limit as the other shell-outs so a hung mount cannot block the caller forever.
    const output = execSync("df -B1 / | tail -1", { encoding: "utf-8", timeout: 5000 });
    const columns = output.trim().split(/\s+/);
    // Format: filesystem 1B-blocks used available use% mountpoint
    const total = Number.parseInt(columns[1] ?? "", 10);
    const used = Number.parseInt(columns[2] ?? "", 10);

    // Guard against NaN and division by zero on unexpected output.
    if (!Number.isFinite(total) || !Number.isFinite(used) || total <= 0) {
      return unavailable;
    }

    return {
      diskTotalGB: roundToOneDecimal(total / bytesPerGB),
      diskUsedGB: roundToOneDecimal(used / bytesPerGB),
      diskUsagePercent: roundToOneDecimal((used / total) * 100),
    };
  } catch {
    return unavailable;
  }
}
/**
 * Derives the aggregate OAuth status from token counts.
 *
 * Expired tokens take precedence over tokens that are only expiring soon.
 *
 * @param counts - Number of expired and soon-to-expire tokens.
 * @returns `"expired"`, `"expiring_soon"`, or `"ok"`.
 */
function getOAuthStatus(counts: { tokensExpired: number; tokensExpiringSoon: number }): OAuthTokenStatus["status"] {
  if (counts.tokensExpired > 0) return "expired";
  if (counts.tokensExpiringSoon > 0) return "expiring_soon";
  return "ok";
}
/** Status of the two PM2-managed processes reported in a metrics snapshot. */
interface Pm2Processes {
  /** Status of the PM2 process named `payload`. */
  payloadProcess: ProcessStatus;
  /** Status of the PM2 process named `queue-worker`. */
  queueWorkerProcess: ProcessStatus;
}
/**
 * Queries PM2 (`pm2 jlist`) for the `payload` and `queue-worker` processes.
 *
 * @param defaultProcess - Status reported for a process that is not listed,
 *   or for both processes when PM2 is unavailable or its output is unusable.
 * @returns The status of both processes.
 */
function getPm2Processes(defaultProcess: ProcessStatus): Pm2Processes {
  let payloadProcess = defaultProcess;
  let queueWorkerProcess = defaultProcess;

  // pm2 output is external JSON: narrow at runtime instead of asserting types.
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null;
  const toNumber = (value: unknown): number =>
    typeof value === "number" && Number.isFinite(value) ? value : 0;

  try {
    const pm2Out = execSync("pm2 jlist", { encoding: "utf-8", timeout: 5000 });
    const pm2List: unknown = JSON.parse(pm2Out);
    if (!Array.isArray(pm2List)) {
      return { payloadProcess, queueWorkerProcess };
    }

    for (const proc of pm2List) {
      if (!isRecord(proc)) continue;

      const env = isRecord(proc.pm2_env) ? proc.pm2_env : undefined;
      const monit = isRecord(proc.monit) ? proc.monit : undefined;
      // pm_uptime is treated as the start time in epoch milliseconds, as in
      // the subtraction from Date.now() below.
      const startedAt = toNumber(env?.pm_uptime);

      const info: ProcessStatus = {
        status: env?.status === "online" ? "online" : "offline",
        pid: toNumber(proc.pid),
        memoryMB: Math.round(toNumber(monit?.memory) / 1024 / 1024),
        uptimeSeconds: startedAt > 0 ? Math.round((Date.now() - startedAt) / 1000) : 0,
        restarts: toNumber(env?.restart_time),
      };

      if (proc.name === "payload") {
        payloadProcess = info;
      } else if (proc.name === "queue-worker") {
        queueWorkerProcess = info;
      }
    }
  } catch {
    // PM2 not available (or its output was not valid JSON): keep what we have.
  }

  return { payloadProcess, queueWorkerProcess };
}

View file

@ -7,14 +7,14 @@
 * error rates, and throughput.
 */
import type { PerformanceEntry, PerformanceMetrics } from './types' import type { PerformanceEntry, PerformanceMetrics } from "./types";
/** Length of each supported reporting period, in milliseconds. */
const PERIOD_MS: Record<string, number> = {
  "1h": 3_600_000,
  "6h": 21_600_000,
  "24h": 86_400_000,
  "7d": 604_800_000,
};
/** All-zero metrics, returned when no request falls inside the requested period. */
const EMPTY_METRICS: PerformanceMetrics = {
  avgResponseTimeMs: 0,
  p95ResponseTimeMs: 0,
  p99ResponseTimeMs: 0,
  errorRate: 0,
  requestsPerMinute: 0,
};
/**
 * Fixed-capacity ring buffer of request timings.
 *
 * Once `capacity` entries have been recorded, each new entry overwrites the
 * oldest one. Aggregated metrics are computed on demand by `getMetrics`.
 */
export class PerformanceTracker {
  private readonly buffer: PerformanceEntry[];
  /** Index of the slot the next entry is written to. */
  private pointer = 0;
  /** Number of filled slots; never exceeds `capacity`. */
  private count = 0;
  private readonly capacity: number;

  /**
   * @param capacity - Maximum number of entries kept; must be a positive integer.
   * @throws RangeError if `capacity` is not a positive integer (a capacity of 0
   *   would otherwise turn the write pointer into NaN on the first `track`).
   */
  constructor(capacity: number = 10_000) {
    if (!Number.isInteger(capacity) || capacity < 1) {
      throw new RangeError(`PerformanceTracker capacity must be a positive integer, got ${capacity}`);
    }
    this.capacity = capacity;
    this.buffer = new Array<PerformanceEntry>(capacity);
  }

  /**
   * Records one handled request, stamped with the current time.
   *
   * @param method - HTTP method of the request.
   * @param path - Request path.
   * @param statusCode - Response status code.
   * @param durationMs - Time taken to handle the request, in milliseconds.
   */
  track(method: string, path: string, statusCode: number, durationMs: number): void {
    this.buffer[this.pointer] = {
      timestamp: Date.now(),
      method,
      path,
      statusCode,
      durationMs,
    };
    this.pointer = (this.pointer + 1) % this.capacity;
    if (this.count < this.capacity) {
      this.count++;
    }
  }

  /**
   * Aggregates the entries recorded within the given period.
   *
   * @param period - Look-back window; an unknown value falls back to one hour.
   * @returns Average, p95 and p99 response time, the share of responses with
   *   status >= 500, and requests per minute. Requests per minute is measured
   *   from the earliest matching entry until now, with a minimum window of one
   *   minute. All values are zero when no entry matches.
   */
  getMetrics(period: "1h" | "6h" | "24h" | "7d" = "1h"): PerformanceMetrics {
    const now = Date.now();
    const cutoff = now - (PERIOD_MS[period] ?? PERIOD_MS["1h"]);

    // Single pass over the filled slots. This avoids spreading a potentially
    // very large array into Math.min, which can exceed the argument limit.
    const durations: number[] = [];
    let errorCount = 0;
    let earliestTimestamp = Number.POSITIVE_INFINITY;

    for (let i = 0; i < this.count; i++) {
      const entry = this.buffer[i];
      if (entry && entry.timestamp >= cutoff) {
        durations.push(entry.durationMs);
        if (entry.statusCode >= 500) errorCount++;
        if (entry.timestamp < earliestTimestamp) earliestTimestamp = entry.timestamp;
      }
    }

    if (durations.length === 0) {
      return { ...EMPTY_METRICS };
    }

    durations.sort((a, b) => a - b);
    const avg = durations.reduce((sum, d) => sum + d, 0) / durations.length;
    const p95 = percentile(durations, 0.95);
    const p99 = percentile(durations, 0.99);

    const errorRate = errorCount / durations.length;

    const windowMinutes = Math.max((now - earliestTimestamp) / 60_000, 1);
    const requestsPerMinute = durations.length / windowMinutes;

    return {
      avgResponseTimeMs: Math.round(avg),
      p95ResponseTimeMs: p95,
      p99ResponseTimeMs: p99,
      errorRate: Math.round(errorRate * 1000) / 1000,
      requestsPerMinute: Math.round(requestsPerMinute * 10) / 10,
    };
  }
}
/**
 * Picks the value at fraction `p` of an ascending-sorted list.
 *
 * The index is `floor(length * p)`, clamped to the valid range, so `p = 1`
 * yields the last element.
 *
 * @param sorted - Values sorted in ascending order.
 * @param p - Fraction between 0 and 1 (e.g. 0.95 for the 95th percentile).
 * @returns The selected value, or 0 when `sorted` is empty.
 */
function percentile(sorted: number[], p: number): number {
  if (sorted.length === 0) return 0;
  const index = Math.min(Math.max(Math.floor(sorted.length * p), 0), sorted.length - 1);
  return sorted[index] ?? 0;
}
/** Singleton instance used across the application. */
export const performanceTracker = new PerformanceTracker(10_000);

View file

@ -6,62 +6,62 @@
 * sie in MonitoringSnapshots. Evaluiert dabei Alert-Regeln.
 */
import { collectMetrics } from './monitoring-service' import { collectMetrics } from "./monitoring-service";
import { AlertEvaluator } from './alert-evaluator' import { AlertEvaluator } from "./alert-evaluator";
/** Handle of the periodic collection timer; `null` while the collector is stopped. */
let interval: ReturnType<typeof setInterval> | null = null;
/** Evaluates alert rules against every collected metrics snapshot. */
const alertEvaluator = new AlertEvaluator();

/** Cached Payload instance — resolved once, reused on every tick. */
let cachedPayload: any = null;
/**
 * Returns the Payload instance, initialising it on first use.
 *
 * `payload` and the project config are imported lazily, and the resolved
 * instance is cached in `cachedPayload` for subsequent calls.
 *
 * @returns The shared Payload instance.
 */
async function getPayloadInstance(): Promise<any> {
  if (cachedPayload) return cachedPayload;

  const { getPayload } = await import("payload");
  const config = (await import(/* @vite-ignore */ "@payload-config")).default;
  cachedPayload = await getPayload({ config });
  return cachedPayload;
}
/**
 * Starts periodic snapshot collection.
 *
 * Collects one snapshot immediately, then one every
 * `MONITORING_SNAPSHOT_INTERVAL` milliseconds (default 60000). A missing,
 * non-numeric, non-positive, or too-large value falls back to the default.
 * Without that fallback the timer delay would be coerced to 1 ms and
 * collection would run in a tight loop. Calling this while a collector is
 * already running replaces the existing timer instead of leaking it.
 */
export async function startSnapshotCollector(): Promise<void> {
  const DEFAULT_INTERVAL_MS = 60000;
  // Timers clamp delays above this (2^31 - 1) to 1 ms, so treat them as invalid.
  const MAX_TIMER_DELAY_MS = 2147483647;

  const parsed = Number.parseInt(process.env.MONITORING_SNAPSHOT_INTERVAL ?? "", 10);
  const isValid = Number.isFinite(parsed) && parsed > 0 && parsed <= MAX_TIMER_DELAY_MS;
  const INTERVAL = isValid ? parsed : DEFAULT_INTERVAL_MS;

  console.log(`[SnapshotCollector] Starting (interval: ${INTERVAL}ms)`);

  // Drop a timer left over from an earlier start so only one runs at a time.
  if (interval) {
    clearInterval(interval);
    interval = null;
  }

  // Run immediately once, then on interval
  await collectAndSave();

  interval = setInterval(() => {
    // collectAndSave handles its own errors; `void` marks the fire-and-forget call.
    void collectAndSave();
  }, INTERVAL);
}
/**
 * Collects the current metrics, stores them as a `monitoring-snapshots`
 * document, and evaluates the alert rules against them.
 *
 * Never rejects: any failure is logged, and the cached Payload instance is
 * dropped so that the next tick resolves a fresh one.
 */
async function collectAndSave(): Promise<void> {
  try {
    const payload = await getPayloadInstance();
    const metrics = await collectMetrics();

    await payload.create({
      collection: "monitoring-snapshots",
      data: {
        timestamp: new Date().toISOString(),
        ...metrics,
      },
    });

    // Evaluate alert rules against collected metrics
    await alertEvaluator.evaluateRules(payload, metrics);
  } catch (error) {
    console.error("[SnapshotCollector] Error:", error);
    // Reset cache on error so next tick re-resolves
    cachedPayload = null;
  }
}
/**
 * Stops periodic snapshot collection.
 *
 * Clears the timer if one is running; safe to call when the collector is
 * already stopped.
 */
export async function stopSnapshotCollector(): Promise<void> {
  if (interval) {
    clearInterval(interval);
    interval = null;
  }
  console.log("[SnapshotCollector] Stopped");
}

View file

@ -1,192 +1,185 @@
// === System Health ===

/** Host-level resource usage at the time of a snapshot. */
export interface SystemHealth {
  cpuUsagePercent: number;
  memoryUsedMB: number;
  memoryTotalMB: number;
  memoryUsagePercent: number;
  diskUsedGB: number;
  diskTotalGB: number;
  diskUsagePercent: number;
  loadAvg1: number;
  loadAvg5: number;
  uptime: number; // seconds
}
// === Service Statuses ===

/** Availability state shared by all monitored services. */
export type ServiceStatusType = "online" | "warning" | "offline";

/** Status of a managed application process. */
export interface ProcessStatus {
  status: ServiceStatusType;
  pid: number;
  memoryMB: number;
  uptimeSeconds: number;
  restarts: number;
}

/** Status of the PostgreSQL server. */
export interface PostgresqlStatus {
  status: ServiceStatusType;
  connections: number;
  maxConnections: number;
  latencyMs: number;
}

/** Status of the PgBouncer connection pooler. */
export interface PgBouncerStatus {
  status: ServiceStatusType;
  activeConnections: number;
  waitingClients: number;
  poolSize: number;
}

/** Status of the Redis server. */
export interface RedisStatus {
  status: ServiceStatusType;
  memoryUsedMB: number;
  connectedClients: number;
  opsPerSec: number;
}

/** Status of every internal service included in a snapshot. */
export interface ServiceStatuses {
  payload: ProcessStatus;
  queueWorker: ProcessStatus;
  postgresql: PostgresqlStatus;
  pgbouncer: PgBouncerStatus;
  redis: RedisStatus;
}
// === External Statuses ===

/** Result of the SMTP connectivity check. */
export interface SmtpStatus {
  status: ServiceStatusType;
  lastCheck: string; // ISO date
  responseTimeMs: number;
}

/** Aggregate state of a set of OAuth tokens. */
export type OAuthStatusType = "ok" | "expiring_soon" | "expired" | "error";

/** Token counts and aggregate state for one OAuth provider. */
export interface OAuthTokenStatus {
  status: OAuthStatusType;
  tokensTotal: number;
  tokensExpiringSoon: number;
  tokensExpired: number;
}

/** Outcome of the most recent run of a cron job. */
export interface CronJobStatus {
  lastRun: string; // ISO date
  status: "ok" | "failed" | "unknown";
}

/** Status of each monitored cron job. */
export interface CronStatuses {
  communitySync: CronJobStatus;
  tokenRefresh: CronJobStatus;
  youtubeSync: CronJobStatus;
}

/** A secret that is approaching its expiry date. */
export interface SecretExpiringSoon {
  name: string;
  expiresAt: string;
  daysRemaining: number;
}

/** A secret whose expiry date has passed. */
export interface SecretExpired {
  name: string;
  expiresAt: string;
}

/** A secret that is overdue for rotation. */
export interface SecretRotationOverdue {
  name: string;
  rotatedAt: string;
  ageDays: number;
}

/** Result of the secrets health check. */
export interface SecretsHealthStatus {
  status: "ok" | "warning" | "critical";
  checkedAt: string;
  missing: string[];
  expiringSoon: SecretExpiringSoon[];
  expired: SecretExpired[];
  rotationOverdue: SecretRotationOverdue[];
}

/** Security event counters collected over a time window. */
export interface SecurityMetricsStatus {
  windowMs: number;
  counters: Array<{
    eventType: string;
    count: number;
    windowStart: string;
  }>;
}

/** Status of every external dependency included in a snapshot. */
export interface ExternalStatuses {
  smtp: SmtpStatus;
  metaOAuth: OAuthTokenStatus;
  youtubeOAuth: OAuthTokenStatus;
  cronJobs: CronStatuses;
  secrets: SecretsHealthStatus;
  securityEvents: SecurityMetricsStatus;
}
// === Performance ===

/** Aggregated request performance over a reporting period. */
export interface PerformanceMetrics {
  avgResponseTimeMs: number;
  p95ResponseTimeMs: number;
  p99ResponseTimeMs: number;
  errorRate: number; // 0-1
  requestsPerMinute: number;
}
// === Full Snapshot ===

/** One complete monitoring snapshot. */
export interface SystemMetrics {
  timestamp: string; // ISO date
  system: SystemHealth;
  services: ServiceStatuses;
  external: ExternalStatuses;
  performance: PerformanceMetrics;
}
// === SSE Events (discriminated union) ===

/** Event pushed to monitoring clients; `type` selects the shape of `data`. */
export type MonitoringEvent =
  | { type: "health"; data: SystemHealth }
  | { type: "service"; data: Partial<ServiceStatuses> }
  | { type: "alert"; data: AlertEvent }
  | { type: "log"; data: LogEvent }
  | { type: "performance"; data: PerformanceMetrics };

/** Payload of an `alert` event. */
export interface AlertEvent {
  id: string;
  ruleId: string;
  metric: string;
  value: number;
  threshold: number;
  severity: AlertSeverity;
  message: string;
  timestamp: string;
}

/** Payload of a `log` event. */
export interface LogEvent {
  id: string;
  level: LogLevel;
  source: LogSource;
  message: string;
  timestamp: string;
  context?: Record<string, unknown>;
}
// === Enums as union types ===

/** Comparison operator used by an alert rule. */
export type AlertCondition = "gt" | "lt" | "eq" | "gte" | "lte";
/** Severity of a triggered alert. */
export type AlertSeverity = "warning" | "error" | "critical";
/** Severity level of a log entry. */
export type LogLevel = "debug" | "info" | "warn" | "error" | "fatal";
/** Subsystem that produced a log entry. */
export type LogSource = "payload" | "queue-worker" | "cron" | "email" | "oauth" | "sync" | "security";
// === Performance Tracker Entry ===

/** One recorded request, as stored by the performance tracker. */
export interface PerformanceEntry {
  timestamp: number; // Date.now()
  method: string;
  path: string;
  statusCode: number;
  durationMs: number;
}