mirror of
https://github.com/complexcaresolutions/cms.c2sgmbh.git
synced 2026-03-17 19:44:12 +00:00
fix(ci): increase build heap size and format monitoring files
The build was running out of memory (OOM) in CI under Node's default heap limit. Set NODE_OPTIONS to '--max-old-space-size=4096' (a 4 GB heap) on the build step. Also ran Prettier on the monitoring files. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
884d33c0ae
commit
037835d1de
7 changed files with 481 additions and 510 deletions
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
|
|
@ -212,6 +212,7 @@ jobs:
|
|||
- name: Build application
|
||||
run: pnpm build
|
||||
env:
|
||||
NODE_OPTIONS: '--max-old-space-size=4096'
|
||||
# Minimal env vars for build
|
||||
PAYLOAD_SECRET: build-secret-placeholder
|
||||
DATABASE_URI: postgresql://placeholder:placeholder@localhost:5432/placeholder
|
||||
|
|
|
|||
|
|
@ -6,8 +6,8 @@
|
|||
* cooldown periods, and multi-channel alert dispatch.
|
||||
*/
|
||||
|
||||
import type { Payload } from 'payload'
|
||||
import type { AlertCondition, AlertSeverity, SystemMetrics } from './types'
|
||||
import type { Payload } from "payload";
|
||||
import type { AlertCondition, AlertSeverity, SystemMetrics } from "./types";
|
||||
|
||||
// ============================================================================
|
||||
// Pure Functions
|
||||
|
|
@ -18,40 +18,36 @@ import type { AlertCondition, AlertSeverity, SystemMetrics } from './types'
|
|||
* Example: getMetricValue(metrics, 'system.cpuUsagePercent') => 92
|
||||
*/
|
||||
export function getMetricValue(metrics: Record<string, unknown>, path: string): number | undefined {
|
||||
const parts = path.split('.')
|
||||
let current: unknown = metrics
|
||||
const parts = path.split(".");
|
||||
let current: unknown = metrics;
|
||||
|
||||
for (const part of parts) {
|
||||
if (current === null || current === undefined || typeof current !== 'object') {
|
||||
return undefined
|
||||
if (current === null || current === undefined || typeof current !== "object") {
|
||||
return undefined;
|
||||
}
|
||||
current = (current as Record<string, unknown>)[part]
|
||||
current = (current as Record<string, unknown>)[part];
|
||||
}
|
||||
|
||||
return typeof current === 'number' ? current : undefined
|
||||
return typeof current === "number" ? current : undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluates a condition against a value and threshold.
|
||||
*/
|
||||
export function evaluateCondition(
|
||||
condition: AlertCondition,
|
||||
value: number,
|
||||
threshold: number,
|
||||
): boolean {
|
||||
export function evaluateCondition(condition: AlertCondition, value: number, threshold: number): boolean {
|
||||
switch (condition) {
|
||||
case 'gt':
|
||||
return value > threshold
|
||||
case 'lt':
|
||||
return value < threshold
|
||||
case 'eq':
|
||||
return value === threshold
|
||||
case 'gte':
|
||||
return value >= threshold
|
||||
case 'lte':
|
||||
return value <= threshold
|
||||
case "gt":
|
||||
return value > threshold;
|
||||
case "lt":
|
||||
return value < threshold;
|
||||
case "eq":
|
||||
return value === threshold;
|
||||
case "gte":
|
||||
return value >= threshold;
|
||||
case "lte":
|
||||
return value <= threshold;
|
||||
default:
|
||||
return false
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -60,28 +56,28 @@ export function evaluateCondition(
|
|||
// ============================================================================
|
||||
|
||||
interface AlertRule {
|
||||
id: number
|
||||
name: string
|
||||
metric: string
|
||||
condition: AlertCondition
|
||||
threshold: number
|
||||
severity: AlertSeverity
|
||||
channels: Array<'email' | 'slack' | 'discord'>
|
||||
id: number;
|
||||
name: string;
|
||||
metric: string;
|
||||
condition: AlertCondition;
|
||||
threshold: number;
|
||||
severity: AlertSeverity;
|
||||
channels: Array<"email" | "slack" | "discord">;
|
||||
recipients?: {
|
||||
emails?: Array<{ email: string }>
|
||||
slackWebhook?: string
|
||||
discordWebhook?: string
|
||||
}
|
||||
cooldownMinutes: number
|
||||
enabled: boolean
|
||||
emails?: Array<{ email: string }>;
|
||||
slackWebhook?: string;
|
||||
discordWebhook?: string;
|
||||
};
|
||||
cooldownMinutes: number;
|
||||
enabled: boolean;
|
||||
}
|
||||
|
||||
// Maps AlertSeverity to the AlertLevel expected by alert-service
|
||||
const SEVERITY_TO_LEVEL: Record<AlertSeverity, string> = {
|
||||
warning: 'warning',
|
||||
error: 'error',
|
||||
critical: 'critical',
|
||||
}
|
||||
warning: "warning",
|
||||
error: "error",
|
||||
critical: "critical",
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// AlertEvaluator Class
|
||||
|
|
@ -89,58 +85,52 @@ const SEVERITY_TO_LEVEL: Record<AlertSeverity, string> = {
|
|||
|
||||
export class AlertEvaluator {
|
||||
/** Tracks last fire time per rule to enforce cooldown */
|
||||
private cooldownMap: Map<string, number> = new Map()
|
||||
private cooldownMap: Map<string, number> = new Map();
|
||||
|
||||
/**
|
||||
* Returns true if the rule should fire (not in cooldown).
|
||||
*/
|
||||
shouldFire(ruleId: string, cooldownMinutes: number): boolean {
|
||||
const lastFired = this.cooldownMap.get(ruleId)
|
||||
const lastFired = this.cooldownMap.get(ruleId);
|
||||
if (lastFired) {
|
||||
const elapsedMinutes = (Date.now() - lastFired) / 60_000
|
||||
if (elapsedMinutes < cooldownMinutes) return false
|
||||
const elapsedMinutes = (Date.now() - lastFired) / 60_000;
|
||||
if (elapsedMinutes < cooldownMinutes) return false;
|
||||
}
|
||||
return true
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Record that a rule fired successfully. */
|
||||
recordFired(ruleId: string): void {
|
||||
this.cooldownMap.set(ruleId, Date.now())
|
||||
this.cooldownMap.set(ruleId, Date.now());
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluates all enabled rules against current metrics.
|
||||
* Fires alerts for rules that match and are not in cooldown.
|
||||
*/
|
||||
async evaluateRules(
|
||||
payload: Payload,
|
||||
metrics: Omit<SystemMetrics, 'timestamp'>,
|
||||
): Promise<void> {
|
||||
async evaluateRules(payload: Payload, metrics: Omit<SystemMetrics, "timestamp">): Promise<void> {
|
||||
try {
|
||||
const rules = await payload.find({
|
||||
collection: 'monitoring-alert-rules',
|
||||
collection: "monitoring-alert-rules",
|
||||
where: { enabled: { equals: true } },
|
||||
limit: 100,
|
||||
})
|
||||
});
|
||||
|
||||
for (const doc of rules.docs) {
|
||||
const rule = doc as unknown as AlertRule
|
||||
const value = getMetricValue(
|
||||
metrics as unknown as Record<string, unknown>,
|
||||
rule.metric,
|
||||
)
|
||||
if (value === undefined) continue
|
||||
const rule = doc as unknown as AlertRule;
|
||||
const value = getMetricValue(metrics as unknown as Record<string, unknown>, rule.metric);
|
||||
if (value === undefined) continue;
|
||||
|
||||
if (evaluateCondition(rule.condition, value, rule.threshold)) {
|
||||
const ruleKey = String(rule.id)
|
||||
const ruleKey = String(rule.id);
|
||||
if (this.shouldFire(ruleKey, rule.cooldownMinutes)) {
|
||||
await this.dispatchAlert(payload, rule, value)
|
||||
this.recordFired(ruleKey)
|
||||
await this.dispatchAlert(payload, rule, value);
|
||||
this.recordFired(ruleKey);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('[AlertEvaluator] Error evaluating rules:', error)
|
||||
console.error("[AlertEvaluator] Error evaluating rules:", error);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -149,11 +139,11 @@ export class AlertEvaluator {
|
|||
* via the existing alert service.
|
||||
*/
|
||||
private async dispatchAlert(payload: Payload, rule: AlertRule, value: number): Promise<void> {
|
||||
const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`
|
||||
const message = `${rule.name}: ${rule.metric} = ${value} (threshold: ${rule.condition} ${rule.threshold})`;
|
||||
|
||||
try {
|
||||
await payload.create({
|
||||
collection: 'monitoring-alert-history',
|
||||
collection: "monitoring-alert-history",
|
||||
data: {
|
||||
rule: rule.id,
|
||||
metric: rule.metric,
|
||||
|
|
@ -163,13 +153,13 @@ export class AlertEvaluator {
|
|||
message,
|
||||
channelsSent: rule.channels,
|
||||
},
|
||||
})
|
||||
});
|
||||
|
||||
// Try to send via existing alert service
|
||||
try {
|
||||
const { sendAlert } = await import('../alerting/alert-service.js')
|
||||
const { sendAlert } = await import("../alerting/alert-service.js");
|
||||
await sendAlert(payload, {
|
||||
level: SEVERITY_TO_LEVEL[rule.severity] as 'warning' | 'error' | 'critical',
|
||||
level: SEVERITY_TO_LEVEL[rule.severity] as "warning" | "error" | "critical",
|
||||
title: `[${rule.severity.toUpperCase()}] ${rule.name}`,
|
||||
message,
|
||||
details: {
|
||||
|
|
@ -178,13 +168,13 @@ export class AlertEvaluator {
|
|||
threshold: rule.threshold,
|
||||
condition: rule.condition,
|
||||
},
|
||||
})
|
||||
});
|
||||
} catch {
|
||||
// Alert service not available, history record is sufficient
|
||||
console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`)
|
||||
console.warn(`[AlertEvaluator] Could not dispatch via alert-service: ${message}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('[AlertEvaluator] Error dispatching alert:', error)
|
||||
console.error("[AlertEvaluator] Error dispatching alert:", error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
* Falls back to console output when Payload is not yet initialized.
|
||||
*/
|
||||
|
||||
import type { LogLevel, LogSource } from './types'
|
||||
import type { LogLevel, LogSource } from "./types";
|
||||
|
||||
const LOG_LEVELS: Record<LogLevel, number> = {
|
||||
debug: 0,
|
||||
|
|
@ -13,63 +13,58 @@ const LOG_LEVELS: Record<LogLevel, number> = {
|
|||
warn: 2,
|
||||
error: 3,
|
||||
fatal: 4,
|
||||
}
|
||||
};
|
||||
|
||||
function getMinLevel(): LogLevel {
|
||||
return (process.env.MONITORING_LOG_LEVEL as LogLevel) || 'info'
|
||||
return (process.env.MONITORING_LOG_LEVEL as LogLevel) || "info";
|
||||
}
|
||||
|
||||
function shouldLog(level: LogLevel): boolean {
|
||||
return LOG_LEVELS[level] >= LOG_LEVELS[getMinLevel()]
|
||||
return LOG_LEVELS[level] >= LOG_LEVELS[getMinLevel()];
|
||||
}
|
||||
|
||||
export interface LogContext {
|
||||
requestId?: string
|
||||
userId?: number
|
||||
tenant?: number
|
||||
duration?: number
|
||||
[key: string]: unknown
|
||||
requestId?: string;
|
||||
userId?: number;
|
||||
tenant?: number;
|
||||
duration?: number;
|
||||
[key: string]: unknown;
|
||||
}
|
||||
|
||||
export interface MonitoringLoggerInstance {
|
||||
debug(message: string, context?: LogContext): void
|
||||
info(message: string, context?: LogContext): void
|
||||
warn(message: string, context?: LogContext): void
|
||||
error(message: string, context?: LogContext): void
|
||||
fatal(message: string, context?: LogContext): void
|
||||
debug(message: string, context?: LogContext): void;
|
||||
info(message: string, context?: LogContext): void;
|
||||
warn(message: string, context?: LogContext): void;
|
||||
error(message: string, context?: LogContext): void;
|
||||
fatal(message: string, context?: LogContext): void;
|
||||
}
|
||||
|
||||
/** Cached Payload instance — resolved once, reused for all subsequent writes. */
|
||||
let cachedPayload: any = null
|
||||
let cachedPayload: any = null;
|
||||
|
||||
async function getPayloadInstance(): Promise<any> {
|
||||
if (cachedPayload) return cachedPayload
|
||||
const { getPayload } = await import('payload')
|
||||
const config = (await import(/* @vite-ignore */ '@payload-config')).default
|
||||
cachedPayload = await getPayload({ config })
|
||||
return cachedPayload
|
||||
if (cachedPayload) return cachedPayload;
|
||||
const { getPayload } = await import("payload");
|
||||
const config = (await import(/* @vite-ignore */ "@payload-config")).default;
|
||||
cachedPayload = await getPayload({ config });
|
||||
return cachedPayload;
|
||||
}
|
||||
|
||||
/** Reset cached instance (used in tests). */
|
||||
export function _resetPayloadCache(): void {
|
||||
cachedPayload = null
|
||||
cachedPayload = null;
|
||||
}
|
||||
|
||||
async function writeLog(
|
||||
source: LogSource,
|
||||
level: LogLevel,
|
||||
message: string,
|
||||
context?: LogContext,
|
||||
): Promise<void> {
|
||||
if (!shouldLog(level)) return
|
||||
async function writeLog(source: LogSource, level: LogLevel, message: string, context?: LogContext): Promise<void> {
|
||||
if (!shouldLog(level)) return;
|
||||
|
||||
try {
|
||||
const payload = await getPayloadInstance()
|
||||
const payload = await getPayloadInstance();
|
||||
|
||||
const { requestId, userId, tenant, duration, ...rest } = context || {}
|
||||
const { requestId, userId, tenant, duration, ...rest } = context || {};
|
||||
|
||||
await payload.create({
|
||||
collection: 'monitoring-logs',
|
||||
collection: "monitoring-logs",
|
||||
data: {
|
||||
level,
|
||||
source,
|
||||
|
|
@ -80,12 +75,12 @@ async function writeLog(
|
|||
tenant,
|
||||
duration,
|
||||
},
|
||||
})
|
||||
});
|
||||
} catch {
|
||||
// Fallback to console if Payload is not yet initialized
|
||||
cachedPayload = null
|
||||
const prefix = `[${source}][${level.toUpperCase()}]`
|
||||
console.log(prefix, message, context || '')
|
||||
cachedPayload = null;
|
||||
const prefix = `[${source}][${level.toUpperCase()}]`;
|
||||
console.log(prefix, message, context || "");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -94,16 +89,16 @@ export function createMonitoringLogger(source: LogSource): MonitoringLoggerInsta
|
|||
return function logMessage(message: string, context?: LogContext): void {
|
||||
// Fire-and-forget -- don't block the caller
|
||||
writeLog(source, level, message, context).catch(function onError(err) {
|
||||
console.error(`[MonitoringLogger] Failed to write ${level} log:`, err)
|
||||
})
|
||||
}
|
||||
console.error(`[MonitoringLogger] Failed to write ${level} log:`, err);
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
debug: log('debug'),
|
||||
info: log('info'),
|
||||
warn: log('warn'),
|
||||
error: log('error'),
|
||||
fatal: log('fatal'),
|
||||
}
|
||||
debug: log("debug"),
|
||||
info: log("info"),
|
||||
warn: log("warn"),
|
||||
error: log("error"),
|
||||
fatal: log("fatal"),
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,8 +5,8 @@
|
|||
* dependency checks. Used by the monitoring dashboard and snapshot collector.
|
||||
*/
|
||||
|
||||
import os from 'node:os'
|
||||
import { execSync } from 'node:child_process'
|
||||
import os from "node:os";
|
||||
import { execSync } from "node:child_process";
|
||||
import type {
|
||||
SystemHealth,
|
||||
ProcessStatus,
|
||||
|
|
@ -21,9 +21,9 @@ import type {
|
|||
SecurityMetricsStatus,
|
||||
PerformanceMetrics,
|
||||
SystemMetrics,
|
||||
} from './types'
|
||||
import { checkSecretsHealth } from '../security/secrets-health'
|
||||
import { getSecurityMetricsSnapshot } from '../security/security-observability'
|
||||
} from "./types";
|
||||
import { checkSecretsHealth } from "../security/secrets-health";
|
||||
import { getSecurityMetricsSnapshot } from "../security/security-observability";
|
||||
|
||||
// ============================================================================
|
||||
// System Health
|
||||
|
|
@ -34,15 +34,15 @@ import { getSecurityMetricsSnapshot } from '../security/security-observability'
|
|||
* CPU usage is calculated by sampling /proc/stat twice with 100ms delay.
|
||||
*/
|
||||
export async function checkSystemHealth(): Promise<SystemHealth> {
|
||||
const cpuUsagePercent = await getCpuUsage()
|
||||
const cpuUsagePercent = await getCpuUsage();
|
||||
|
||||
const memoryTotalMB = Math.round(os.totalmem() / 1024 / 1024)
|
||||
const memoryUsedMB = Math.round((os.totalmem() - os.freemem()) / 1024 / 1024)
|
||||
const memoryUsagePercent = roundToOneDecimal((memoryUsedMB / memoryTotalMB) * 100)
|
||||
const memoryTotalMB = Math.round(os.totalmem() / 1024 / 1024);
|
||||
const memoryUsedMB = Math.round((os.totalmem() - os.freemem()) / 1024 / 1024);
|
||||
const memoryUsagePercent = roundToOneDecimal((memoryUsedMB / memoryTotalMB) * 100);
|
||||
|
||||
const { diskUsedGB, diskTotalGB, diskUsagePercent } = getDiskUsage()
|
||||
const { diskUsedGB, diskTotalGB, diskUsagePercent } = getDiskUsage();
|
||||
|
||||
const [loadAvg1, loadAvg5] = os.loadavg()
|
||||
const [loadAvg1, loadAvg5] = os.loadavg();
|
||||
|
||||
return {
|
||||
cpuUsagePercent: roundToOneDecimal(cpuUsagePercent),
|
||||
|
|
@ -55,7 +55,7 @@ export async function checkSystemHealth(): Promise<SystemHealth> {
|
|||
loadAvg1: roundToTwoDecimals(loadAvg1),
|
||||
loadAvg5: roundToTwoDecimals(loadAvg5),
|
||||
uptime: Math.round(os.uptime()),
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
|
|
@ -64,214 +64,212 @@ export async function checkSystemHealth(): Promise<SystemHealth> {
|
|||
|
||||
export async function checkRedis(): Promise<RedisStatus> {
|
||||
const offlineStatus: RedisStatus = {
|
||||
status: 'offline',
|
||||
status: "offline",
|
||||
memoryUsedMB: 0,
|
||||
connectedClients: 0,
|
||||
opsPerSec: 0,
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
const { getRedisClient } = await import('../redis.js')
|
||||
const client = getRedisClient()
|
||||
if (!client) return offlineStatus
|
||||
const { getRedisClient } = await import("../redis.js");
|
||||
const client = getRedisClient();
|
||||
if (!client) return offlineStatus;
|
||||
|
||||
const info = await client.info()
|
||||
const info = await client.info();
|
||||
const getVal = (key: string): number => {
|
||||
const match = info.match(new RegExp(`${key}:(\\d+)`))
|
||||
return match ? parseInt(match[1], 10) : 0
|
||||
}
|
||||
const match = info.match(new RegExp(`${key}:(\\d+)`));
|
||||
return match ? parseInt(match[1], 10) : 0;
|
||||
};
|
||||
|
||||
return {
|
||||
status: 'online',
|
||||
memoryUsedMB: Math.round(getVal('used_memory') / 1024 / 1024),
|
||||
connectedClients: getVal('connected_clients'),
|
||||
opsPerSec: getVal('instantaneous_ops_per_sec'),
|
||||
}
|
||||
status: "online",
|
||||
memoryUsedMB: Math.round(getVal("used_memory") / 1024 / 1024),
|
||||
connectedClients: getVal("connected_clients"),
|
||||
opsPerSec: getVal("instantaneous_ops_per_sec"),
|
||||
};
|
||||
} catch {
|
||||
return offlineStatus
|
||||
return offlineStatus;
|
||||
}
|
||||
}
|
||||
|
||||
export async function checkPostgresql(): Promise<PostgresqlStatus> {
|
||||
const offlineStatus: PostgresqlStatus = {
|
||||
status: 'offline',
|
||||
status: "offline",
|
||||
connections: 0,
|
||||
maxConnections: 50,
|
||||
latencyMs: -1,
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
const { getPayload } = await import('payload')
|
||||
const payload = await getPayload({ config: (await import('@payload-config')).default })
|
||||
const { getPayload } = await import("payload");
|
||||
const payload = await getPayload({ config: (await import("@payload-config")).default });
|
||||
|
||||
const start = Date.now()
|
||||
await payload.find({ collection: 'users', limit: 0 })
|
||||
const latencyMs = Date.now() - start
|
||||
const start = Date.now();
|
||||
await payload.find({ collection: "users", limit: 0 });
|
||||
const latencyMs = Date.now() - start;
|
||||
|
||||
let connections = 0
|
||||
let maxConnections = 50
|
||||
let connections = 0;
|
||||
let maxConnections = 50;
|
||||
try {
|
||||
const connResult = runPsql(
|
||||
'-h 10.10.181.101 -U payload -d payload_db -t -c "SELECT count(*) FROM pg_stat_activity WHERE datname = \'payload_db\'"',
|
||||
)
|
||||
connections = parseInt(connResult.trim(), 10) || 0
|
||||
"-h 10.10.181.101 -U payload -d payload_db -t -c \"SELECT count(*) FROM pg_stat_activity WHERE datname = 'payload_db'\"",
|
||||
);
|
||||
connections = parseInt(connResult.trim(), 10) || 0;
|
||||
|
||||
const maxResult = runPsql(
|
||||
'-h 10.10.181.101 -U payload -d payload_db -t -c "SHOW max_connections"',
|
||||
)
|
||||
maxConnections = parseInt(maxResult.trim(), 10) || 50
|
||||
const maxResult = runPsql('-h 10.10.181.101 -U payload -d payload_db -t -c "SHOW max_connections"');
|
||||
maxConnections = parseInt(maxResult.trim(), 10) || 50;
|
||||
} catch {
|
||||
// psql unavailable -- latency check already proves connectivity
|
||||
}
|
||||
|
||||
return {
|
||||
status: latencyMs < 1000 ? 'online' : 'warning',
|
||||
status: latencyMs < 1000 ? "online" : "warning",
|
||||
connections,
|
||||
maxConnections,
|
||||
latencyMs,
|
||||
}
|
||||
};
|
||||
} catch {
|
||||
return offlineStatus
|
||||
return offlineStatus;
|
||||
}
|
||||
}
|
||||
|
||||
export async function checkPgBouncer(): Promise<PgBouncerStatus> {
|
||||
const offlineStatus: PgBouncerStatus = {
|
||||
status: 'offline',
|
||||
status: "offline",
|
||||
activeConnections: 0,
|
||||
waitingClients: 0,
|
||||
poolSize: 0,
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
const output = runPsql('-h 127.0.0.1 -p 6432 -U payload -d pgbouncer -t -c "SHOW POOLS"')
|
||||
const output = runPsql('-h 127.0.0.1 -p 6432 -U payload -d pgbouncer -t -c "SHOW POOLS"');
|
||||
|
||||
// SHOW POOLS columns: database | user | cl_active | cl_waiting | sv_active | sv_idle | pool_size | ...
|
||||
const lines = output
|
||||
.trim()
|
||||
.split('\n')
|
||||
.filter((l) => l.includes('payload'))
|
||||
.split("\n")
|
||||
.filter((l) => l.includes("payload"));
|
||||
|
||||
let activeConnections = 0
|
||||
let waitingClients = 0
|
||||
let poolSize = 20
|
||||
let activeConnections = 0;
|
||||
let waitingClients = 0;
|
||||
let poolSize = 20;
|
||||
|
||||
for (const line of lines) {
|
||||
const parts = line.split('|').map((s) => s.trim())
|
||||
activeConnections += parseInt(parts[2], 10) || 0
|
||||
waitingClients += parseInt(parts[3], 10) || 0
|
||||
poolSize = parseInt(parts[6], 10) || 20
|
||||
const parts = line.split("|").map((s) => s.trim());
|
||||
activeConnections += parseInt(parts[2], 10) || 0;
|
||||
waitingClients += parseInt(parts[3], 10) || 0;
|
||||
poolSize = parseInt(parts[6], 10) || 20;
|
||||
}
|
||||
|
||||
return { status: 'online', activeConnections, waitingClients, poolSize }
|
||||
return { status: "online", activeConnections, waitingClients, poolSize };
|
||||
} catch {
|
||||
return offlineStatus
|
||||
return offlineStatus;
|
||||
}
|
||||
}
|
||||
|
||||
export interface QueueCounts {
|
||||
waiting: number
|
||||
active: number
|
||||
completed: number
|
||||
failed: number
|
||||
waiting: number;
|
||||
active: number;
|
||||
completed: number;
|
||||
failed: number;
|
||||
}
|
||||
|
||||
export async function checkQueues(): Promise<Record<string, QueueCounts>> {
|
||||
try {
|
||||
const { Queue } = await import('bullmq')
|
||||
const { getQueueRedisConnection } = await import('../queue/queue-service.js')
|
||||
const { Queue } = await import("bullmq");
|
||||
const { getQueueRedisConnection } = await import("../queue/queue-service.js");
|
||||
|
||||
const connection = getQueueRedisConnection()
|
||||
const connection = getQueueRedisConnection();
|
||||
// Queue names matching QUEUE_NAMES in queue-service.ts
|
||||
const queueNames = ['email', 'pdf', 'cleanup', 'youtube-upload']
|
||||
const results: Record<string, QueueCounts> = {}
|
||||
const queueNames = ["email", "pdf", "cleanup", "youtube-upload"];
|
||||
const results: Record<string, QueueCounts> = {};
|
||||
|
||||
for (const name of queueNames) {
|
||||
try {
|
||||
const queue = new Queue(name, { connection })
|
||||
const counts = await queue.getJobCounts()
|
||||
const queue = new Queue(name, { connection });
|
||||
const counts = await queue.getJobCounts();
|
||||
results[name] = {
|
||||
waiting: counts.waiting || 0,
|
||||
active: counts.active || 0,
|
||||
completed: counts.completed || 0,
|
||||
failed: counts.failed || 0,
|
||||
}
|
||||
await queue.close()
|
||||
};
|
||||
await queue.close();
|
||||
} catch {
|
||||
results[name] = { waiting: 0, active: 0, completed: 0, failed: 0 }
|
||||
results[name] = { waiting: 0, active: 0, completed: 0, failed: 0 };
|
||||
}
|
||||
}
|
||||
|
||||
return results
|
||||
return results;
|
||||
} catch {
|
||||
return {}
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
export async function checkSmtp(): Promise<SmtpStatus> {
|
||||
const now = new Date().toISOString()
|
||||
const now = new Date().toISOString();
|
||||
|
||||
try {
|
||||
const nodemailer = await import('nodemailer')
|
||||
const nodemailer = await import("nodemailer");
|
||||
const transporter = nodemailer.createTransport({
|
||||
host: process.env.SMTP_HOST,
|
||||
port: parseInt(process.env.SMTP_PORT || '587', 10),
|
||||
secure: process.env.SMTP_SECURE === 'true',
|
||||
port: parseInt(process.env.SMTP_PORT || "587", 10),
|
||||
secure: process.env.SMTP_SECURE === "true",
|
||||
auth: {
|
||||
user: process.env.SMTP_USER,
|
||||
pass: process.env.SMTP_PASS,
|
||||
},
|
||||
})
|
||||
});
|
||||
|
||||
const start = Date.now()
|
||||
await transporter.verify()
|
||||
const responseTimeMs = Date.now() - start
|
||||
const start = Date.now();
|
||||
await transporter.verify();
|
||||
const responseTimeMs = Date.now() - start;
|
||||
|
||||
return { status: 'online', lastCheck: now, responseTimeMs }
|
||||
return { status: "online", lastCheck: now, responseTimeMs };
|
||||
} catch {
|
||||
return { status: 'offline', lastCheck: now, responseTimeMs: -1 }
|
||||
return { status: "offline", lastCheck: now, responseTimeMs: -1 };
|
||||
}
|
||||
}
|
||||
|
||||
export async function checkOAuthTokens(): Promise<{
|
||||
metaOAuth: OAuthTokenStatus
|
||||
youtubeOAuth: OAuthTokenStatus
|
||||
metaOAuth: OAuthTokenStatus;
|
||||
youtubeOAuth: OAuthTokenStatus;
|
||||
}> {
|
||||
const errorStatus: OAuthTokenStatus = {
|
||||
status: 'error',
|
||||
status: "error",
|
||||
tokensTotal: 0,
|
||||
tokensExpiringSoon: 0,
|
||||
tokensExpired: 0,
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
const { getPayload } = await import('payload')
|
||||
const payload = await getPayload({ config: (await import('@payload-config')).default })
|
||||
const { getPayload } = await import("payload");
|
||||
const payload = await getPayload({ config: (await import("@payload-config")).default });
|
||||
|
||||
const accounts = await payload.find({
|
||||
collection: 'social-accounts',
|
||||
collection: "social-accounts",
|
||||
limit: 100,
|
||||
where: { status: { equals: 'connected' } },
|
||||
})
|
||||
where: { status: { equals: "connected" } },
|
||||
});
|
||||
|
||||
const sevenDaysFromNow = new Date()
|
||||
sevenDaysFromNow.setDate(sevenDaysFromNow.getDate() + 7)
|
||||
const now = new Date()
|
||||
const sevenDaysFromNow = new Date();
|
||||
sevenDaysFromNow.setDate(sevenDaysFromNow.getDate() + 7);
|
||||
const now = new Date();
|
||||
|
||||
const meta = { tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 }
|
||||
const youtube = { tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 }
|
||||
const meta = { tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 };
|
||||
const youtube = { tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 };
|
||||
|
||||
for (const account of accounts.docs) {
|
||||
const doc = account as unknown as Record<string, unknown>
|
||||
const target = doc.platform === 'youtube' ? youtube : meta
|
||||
target.tokensTotal++
|
||||
const doc = account as unknown as Record<string, unknown>;
|
||||
const target = doc.platform === "youtube" ? youtube : meta;
|
||||
target.tokensTotal++;
|
||||
|
||||
const expiresAt = doc.tokenExpiresAt ? new Date(doc.tokenExpiresAt as string) : null
|
||||
const expiresAt = doc.tokenExpiresAt ? new Date(doc.tokenExpiresAt as string) : null;
|
||||
if (expiresAt) {
|
||||
if (expiresAt < now) {
|
||||
target.tokensExpired++
|
||||
target.tokensExpired++;
|
||||
} else if (expiresAt < sevenDaysFromNow) {
|
||||
target.tokensExpiringSoon++
|
||||
target.tokensExpiringSoon++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -279,58 +277,55 @@ export async function checkOAuthTokens(): Promise<{
|
|||
return {
|
||||
metaOAuth: { status: getOAuthStatus(meta), ...meta },
|
||||
youtubeOAuth: { status: getOAuthStatus(youtube), ...youtube },
|
||||
}
|
||||
};
|
||||
} catch {
|
||||
return { metaOAuth: errorStatus, youtubeOAuth: errorStatus }
|
||||
return { metaOAuth: errorStatus, youtubeOAuth: errorStatus };
|
||||
}
|
||||
}
|
||||
|
||||
export async function checkCronJobs(): Promise<CronStatuses> {
|
||||
const unknownStatus: CronJobStatus = { lastRun: '', status: 'unknown' }
|
||||
const unknownStatus: CronJobStatus = { lastRun: "", status: "unknown" };
|
||||
|
||||
try {
|
||||
const { getPayload } = await import('payload')
|
||||
const payload = await getPayload({ config: (await import('@payload-config')).default })
|
||||
const { getPayload } = await import("payload");
|
||||
const payload = await getPayload({ config: (await import("@payload-config")).default });
|
||||
|
||||
async function checkCron(source: string): Promise<CronJobStatus> {
|
||||
try {
|
||||
const logs = await payload.find({
|
||||
collection: 'monitoring-logs',
|
||||
collection: "monitoring-logs",
|
||||
limit: 1,
|
||||
sort: '-createdAt',
|
||||
sort: "-createdAt",
|
||||
where: {
|
||||
and: [
|
||||
{ source: { equals: 'cron' } },
|
||||
{ message: { contains: source } },
|
||||
],
|
||||
and: [{ source: { equals: "cron" } }, { message: { contains: source } }],
|
||||
},
|
||||
})
|
||||
});
|
||||
|
||||
if (logs.docs.length === 0) return unknownStatus
|
||||
if (logs.docs.length === 0) return unknownStatus;
|
||||
|
||||
const doc = logs.docs[0] as unknown as Record<string, unknown>
|
||||
const doc = logs.docs[0] as unknown as Record<string, unknown>;
|
||||
return {
|
||||
lastRun: doc.createdAt as string,
|
||||
status: doc.level === 'error' ? 'failed' : 'ok',
|
||||
}
|
||||
status: doc.level === "error" ? "failed" : "ok",
|
||||
};
|
||||
} catch {
|
||||
return unknownStatus
|
||||
return unknownStatus;
|
||||
}
|
||||
}
|
||||
|
||||
const [communitySync, tokenRefresh, youtubeSync] = await Promise.all([
|
||||
checkCron('community-sync'),
|
||||
checkCron('token-refresh'),
|
||||
checkCron('youtube'),
|
||||
])
|
||||
checkCron("community-sync"),
|
||||
checkCron("token-refresh"),
|
||||
checkCron("youtube"),
|
||||
]);
|
||||
|
||||
return { communitySync, tokenRefresh, youtubeSync }
|
||||
return { communitySync, tokenRefresh, youtubeSync };
|
||||
} catch {
|
||||
return {
|
||||
communitySync: unknownStatus,
|
||||
tokenRefresh: unknownStatus,
|
||||
youtubeSync: unknownStatus,
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -342,18 +337,19 @@ export async function checkCronJobs(): Promise<CronStatuses> {
|
|||
* Collects all monitoring metrics in parallel. Individual check failures
|
||||
* are isolated and return safe defaults instead of failing the whole collection.
|
||||
*/
|
||||
export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>> {
|
||||
const [system, redis, postgresql, pgbouncer, smtp, oauth, cronJobs, secrets, securityEvents] = await Promise.allSettled([
|
||||
checkSystemHealth(),
|
||||
checkRedis(),
|
||||
checkPostgresql(),
|
||||
checkPgBouncer(),
|
||||
checkSmtp(),
|
||||
checkOAuthTokens(),
|
||||
checkCronJobs(),
|
||||
Promise.resolve(checkSecretsHealth()),
|
||||
Promise.resolve(getSecurityMetricsSnapshot()),
|
||||
])
|
||||
export async function collectMetrics(): Promise<Omit<SystemMetrics, "timestamp">> {
|
||||
const [system, redis, postgresql, pgbouncer, smtp, oauth, cronJobs, secrets, securityEvents] =
|
||||
await Promise.allSettled([
|
||||
checkSystemHealth(),
|
||||
checkRedis(),
|
||||
checkPostgresql(),
|
||||
checkPgBouncer(),
|
||||
checkSmtp(),
|
||||
checkOAuthTokens(),
|
||||
checkCronJobs(),
|
||||
Promise.resolve(checkSecretsHealth()),
|
||||
Promise.resolve(getSecurityMetricsSnapshot()),
|
||||
]);
|
||||
|
||||
// Load performance tracker lazily to avoid circular dependencies
|
||||
let performance: PerformanceMetrics = {
|
||||
|
|
@ -362,51 +358,51 @@ export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>
|
|||
p99ResponseTimeMs: 0,
|
||||
errorRate: 0,
|
||||
requestsPerMinute: 0,
|
||||
}
|
||||
};
|
||||
try {
|
||||
// Dynamic path constructed at runtime to avoid Vite static analysis
|
||||
// when performance-tracker module has not been created yet
|
||||
const trackerPath = './performance-tracker'
|
||||
const mod = await import(/* @vite-ignore */ trackerPath)
|
||||
performance = mod.performanceTracker.getMetrics('1h')
|
||||
const trackerPath = "./performance-tracker";
|
||||
const mod = await import(/* @vite-ignore */ trackerPath);
|
||||
performance = mod.performanceTracker.getMetrics("1h");
|
||||
} catch {
|
||||
// Performance tracker not yet initialized
|
||||
}
|
||||
|
||||
const defaultProcess: ProcessStatus = {
|
||||
status: 'offline',
|
||||
status: "offline",
|
||||
pid: 0,
|
||||
memoryMB: 0,
|
||||
uptimeSeconds: 0,
|
||||
restarts: 0,
|
||||
}
|
||||
};
|
||||
|
||||
const { payloadProcess, queueWorkerProcess } = getPm2Processes(defaultProcess)
|
||||
const { payloadProcess, queueWorkerProcess } = getPm2Processes(defaultProcess);
|
||||
|
||||
const oauthDefaults = {
|
||||
metaOAuth: { status: 'error' as const, tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 },
|
||||
youtubeOAuth: { status: 'error' as const, tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 },
|
||||
}
|
||||
metaOAuth: { status: "error" as const, tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 },
|
||||
youtubeOAuth: { status: "error" as const, tokensTotal: 0, tokensExpiringSoon: 0, tokensExpired: 0 },
|
||||
};
|
||||
|
||||
const cronDefaults: CronStatuses = {
|
||||
communitySync: { lastRun: '', status: 'unknown' },
|
||||
tokenRefresh: { lastRun: '', status: 'unknown' },
|
||||
youtubeSync: { lastRun: '', status: 'unknown' },
|
||||
}
|
||||
communitySync: { lastRun: "", status: "unknown" },
|
||||
tokenRefresh: { lastRun: "", status: "unknown" },
|
||||
youtubeSync: { lastRun: "", status: "unknown" },
|
||||
};
|
||||
|
||||
const secretsDefaults: SecretsHealthStatus = {
|
||||
status: 'critical',
|
||||
status: "critical",
|
||||
checkedAt: new Date().toISOString(),
|
||||
missing: [],
|
||||
expiringSoon: [],
|
||||
expired: [],
|
||||
rotationOverdue: [],
|
||||
}
|
||||
};
|
||||
|
||||
const securityEventsDefaults: SecurityMetricsStatus = {
|
||||
windowMs: 300000,
|
||||
counters: [],
|
||||
}
|
||||
};
|
||||
|
||||
const systemDefaults: SystemHealth = {
|
||||
cpuUsagePercent: 0,
|
||||
|
|
@ -419,21 +415,21 @@ export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>
|
|||
loadAvg1: 0,
|
||||
loadAvg5: 0,
|
||||
uptime: 0,
|
||||
}
|
||||
};
|
||||
|
||||
const oauthResult = settled(oauth, oauthDefaults)
|
||||
const oauthResult = settled(oauth, oauthDefaults);
|
||||
|
||||
return {
|
||||
system: settled(system, systemDefaults),
|
||||
services: {
|
||||
payload: payloadProcess,
|
||||
queueWorker: queueWorkerProcess,
|
||||
postgresql: settled(postgresql, { status: 'offline', connections: 0, maxConnections: 50, latencyMs: -1 }),
|
||||
pgbouncer: settled(pgbouncer, { status: 'offline', activeConnections: 0, waitingClients: 0, poolSize: 0 }),
|
||||
redis: settled(redis, { status: 'offline', memoryUsedMB: 0, connectedClients: 0, opsPerSec: 0 }),
|
||||
postgresql: settled(postgresql, { status: "offline", connections: 0, maxConnections: 50, latencyMs: -1 }),
|
||||
pgbouncer: settled(pgbouncer, { status: "offline", activeConnections: 0, waitingClients: 0, poolSize: 0 }),
|
||||
redis: settled(redis, { status: "offline", memoryUsedMB: 0, connectedClients: 0, opsPerSec: 0 }),
|
||||
},
|
||||
external: {
|
||||
smtp: settled(smtp, { status: 'offline', lastCheck: new Date().toISOString(), responseTimeMs: -1 }),
|
||||
smtp: settled(smtp, { status: "offline", lastCheck: new Date().toISOString(), responseTimeMs: -1 }),
|
||||
metaOAuth: oauthResult.metaOAuth,
|
||||
youtubeOAuth: oauthResult.youtubeOAuth,
|
||||
cronJobs: settled(cronJobs, cronDefaults),
|
||||
|
|
@ -441,7 +437,7 @@ export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>
|
|||
securityEvents: settled(securityEvents, securityEventsDefaults),
|
||||
},
|
||||
performance,
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
|
|
@ -454,18 +450,18 @@ export async function collectMetrics(): Promise<Omit<SystemMetrics, 'timestamp'>
|
|||
*/
|
||||
function runPsql(args: string): string {
  // NOTE(review): `args` is interpolated into a shell command line without
  // escaping, so callers must only pass trusted, already-quoted arguments.
  return execSync(`psql ${args}`, {
    encoding: "utf-8",
    // Abort the child process if psql has not finished after 5 seconds.
    timeout: 5000,
    // The password is handed to psql through PGPASSWORD; an unset
    // DB_PASSWORD becomes the empty string.
    env: { ...process.env, PGPASSWORD: process.env.DB_PASSWORD || "" },
  });
}
|
||||
|
||||
/** Rounds `value` to one decimal place (e.g. 12.34 becomes 12.3). */
function roundToOneDecimal(value: number): number {
  return Math.round(value * 10) / 10;
}
|
||||
|
||||
/** Rounds `value` to two decimal places (e.g. 3.14159 becomes 3.14). */
function roundToTwoDecimals(value: number): number {
  return Math.round(value * 100) / 100;
}
|
||||
|
||||
/**
|
||||
|
|
@ -473,99 +469,95 @@ function roundToTwoDecimals(value: number): number {
|
|||
* the fallback when the promise was rejected.
|
||||
*/
|
||||
function settled<T>(result: PromiseSettledResult<T>, fallback: T): T {
  // A rejected promise carries no value, so the caller-supplied default is used.
  return result.status === "fulfilled" ? result.value : fallback;
}
|
||||
|
||||
/**
 * Estimates overall CPU utilisation as a percentage.
 *
 * Reads the first line of `/proc/stat` twice, 100 ms apart, and returns the
 * share of non-idle time (idle + iowait count as idle) in that window. If the
 * file cannot be read, or its content does not produce a finite number, the
 * 1-minute load average divided by the CPU count is used instead.
 *
 * @returns CPU usage in percent. The load-average fallback can exceed 100.
 */
async function getCpuUsage(): Promise<number> {
  try {
    const fs = await import("node:fs/promises");
    const stat1 = await fs.readFile("/proc/stat", "utf-8");
    await new Promise((resolve) => setTimeout(resolve, 100));
    const stat2 = await fs.readFile("/proc/stat", "utf-8");

    const parse = (data: string): { idle: number; total: number } => {
      const line = data.split("\n")[0] ?? ""; // first line: cpu user nice system idle ...
      const parts = line.split(/\s+/).slice(1).map(Number);
      // A missing idle column becomes NaN so the sample is rejected below.
      const idle = (parts[3] ?? Number.NaN) + (parts[4] || 0); // idle + iowait
      const total = parts.reduce((a, b) => a + b, 0);
      return { idle, total };
    };

    const s1 = parse(stat1);
    const s2 = parse(stat2);
    const idleDiff = s2.idle - s1.idle;
    const totalDiff = s2.total - s1.total;

    if (totalDiff === 0) return 0;

    const usage = ((totalDiff - idleDiff) / totalDiff) * 100;
    if (!Number.isFinite(usage)) {
      // Malformed /proc/stat content: hand over to the load-average fallback.
      throw new Error("Unparsable /proc/stat sample");
    }
    return usage;
  } catch {
    // Fallback if /proc/stat is unavailable or unparsable.
    // `|| 1` avoids dividing by zero when no CPUs are reported.
    const cpuCount = os.cpus().length || 1;
    return (os.loadavg()[0] / cpuCount) * 100;
  }
}
|
||||
|
||||
/**
 * Reports usage of the root filesystem by parsing the last line of `df -B1 /`.
 *
 * @returns Used and total size in GiB and the usage percentage, each rounded
 *   to one decimal. All three values are 0 when `df` fails, times out, or
 *   prints something that cannot be parsed.
 */
function getDiskUsage(): { diskUsedGB: number; diskTotalGB: number; diskUsagePercent: number } {
  const unavailable = { diskUsedGB: 0, diskTotalGB: 0, diskUsagePercent: 0 };

  try {
    // Same 5 s limit as the other shell-outs in this module.
    const output = execSync("df -B1 / | tail -1", { encoding: "utf-8", timeout: 5000 });
    const parts = output.trim().split(/\s+/);
    // Format: filesystem 1B-blocks used available use% mountpoint
    const total = parseInt(parts[1] ?? "", 10);
    const used = parseInt(parts[2] ?? "", 10);

    // Reject unparsable columns and a zero total, which would otherwise
    // produce NaN or Infinity in the percentage.
    if (!Number.isFinite(total) || !Number.isFinite(used) || total <= 0) {
      return unavailable;
    }

    const bytesPerGB = 1024 * 1024 * 1024;
    return {
      diskTotalGB: roundToOneDecimal(total / bytesPerGB),
      diskUsedGB: roundToOneDecimal(used / bytesPerGB),
      diskUsagePercent: roundToOneDecimal((used / total) * 100),
    };
  } catch {
    return unavailable;
  }
}
|
||||
|
||||
/**
 * Derives the overall OAuth status from token counts.
 *
 * Expired tokens take precedence over tokens that are merely expiring soon.
 *
 * @param counts Number of expired tokens and of tokens expiring soon.
 * @returns "expired", "expiring_soon", or "ok" when both counts are zero.
 */
function getOAuthStatus(counts: { tokensExpired: number; tokensExpiringSoon: number }): OAuthTokenStatus["status"] {
  if (counts.tokensExpired > 0) return "expired";
  if (counts.tokensExpiringSoon > 0) return "expiring_soon";
  return "ok";
}
|
||||
|
||||
/** Status of the two PM2 processes looked up by name ("payload" and "queue-worker"). */
interface Pm2Processes {
  payloadProcess: ProcessStatus;
  queueWorkerProcess: ProcessStatus;
}
|
||||
|
||||
/**
 * Reads the status of the "payload" and "queue-worker" processes from
 * `pm2 jlist`.
 *
 * @param defaultProcess Status reported for a process that is not listed, or
 *   for both processes when PM2 cannot be queried or its output is not a
 *   JSON array.
 * @returns The status of both processes.
 */
function getPm2Processes(defaultProcess: ProcessStatus): Pm2Processes {
  let payloadProcess = defaultProcess;
  let queueWorkerProcess = defaultProcess;

  // PM2 output is untyped JSON, so every field is checked before use.
  const asRecord = (value: unknown): Record<string, unknown> | undefined =>
    value !== null && typeof value === "object" ? (value as Record<string, unknown>) : undefined;
  const asNumber = (value: unknown): number =>
    typeof value === "number" && Number.isFinite(value) ? value : 0;

  try {
    const pm2Out = execSync("pm2 jlist", { encoding: "utf-8", timeout: 5000 });
    const pm2List: unknown = JSON.parse(pm2Out);

    if (Array.isArray(pm2List)) {
      for (const entry of pm2List) {
        const proc = asRecord(entry);
        // Skip malformed entries instead of discarding the whole list.
        if (!proc) continue;

        const env = asRecord(proc.pm2_env);
        const monit = asRecord(proc.monit);
        // pm_uptime is treated as the start time in epoch milliseconds.
        const startedAt = asNumber(env?.pm_uptime);

        const info: ProcessStatus = {
          status: env?.status === "online" ? "online" : "offline",
          pid: asNumber(proc.pid),
          memoryMB: Math.round(asNumber(monit?.memory) / 1024 / 1024),
          uptimeSeconds: startedAt > 0 ? Math.round((Date.now() - startedAt) / 1000) : 0,
          restarts: asNumber(env?.restart_time),
        };

        if (proc.name === "payload") {
          payloadProcess = info;
        } else if (proc.name === "queue-worker") {
          queueWorkerProcess = info;
        }
      }
    }
  } catch {
    // PM2 not available (or its output was not valid JSON): keep the defaults.
  }

  return { payloadProcess, queueWorkerProcess };
}
|
||||
|
|
|
|||
|
|
@ -7,14 +7,14 @@
|
|||
* error rates, and throughput.
|
||||
*/
|
||||
|
||||
import type { PerformanceEntry, PerformanceMetrics } from './types'
|
||||
import type { PerformanceEntry, PerformanceMetrics } from "./types";
|
||||
|
||||
/** Length of each supported reporting period, in milliseconds. */
const PERIOD_MS: Record<string, number> = {
  "1h": 3_600_000,
  "6h": 21_600_000,
  "24h": 86_400_000,
  "7d": 604_800_000,
};
|
||||
|
||||
const EMPTY_METRICS: PerformanceMetrics = {
|
||||
avgResponseTimeMs: 0,
|
||||
|
|
@ -22,17 +22,17 @@ const EMPTY_METRICS: PerformanceMetrics = {
|
|||
p99ResponseTimeMs: 0,
|
||||
errorRate: 0,
|
||||
requestsPerMinute: 0,
|
||||
}
|
||||
};
|
||||
|
||||
export class PerformanceTracker {
|
||||
private readonly buffer: PerformanceEntry[]
|
||||
private pointer: number = 0
|
||||
private count: number = 0
|
||||
private readonly capacity: number
|
||||
private readonly buffer: PerformanceEntry[];
|
||||
private pointer: number = 0;
|
||||
private count: number = 0;
|
||||
private readonly capacity: number;
|
||||
|
||||
constructor(capacity: number = 10_000) {
|
||||
this.capacity = capacity
|
||||
this.buffer = new Array(capacity)
|
||||
this.capacity = capacity;
|
||||
this.buffer = new Array(capacity);
|
||||
}
|
||||
|
||||
track(method: string, path: string, statusCode: number, durationMs: number): void {
|
||||
|
|
@ -42,40 +42,40 @@ export class PerformanceTracker {
|
|||
path,
|
||||
statusCode,
|
||||
durationMs,
|
||||
}
|
||||
this.pointer = (this.pointer + 1) % this.capacity
|
||||
};
|
||||
this.pointer = (this.pointer + 1) % this.capacity;
|
||||
if (this.count < this.capacity) {
|
||||
this.count++
|
||||
this.count++;
|
||||
}
|
||||
}
|
||||
|
||||
getMetrics(period: '1h' | '6h' | '24h' | '7d' = '1h'): PerformanceMetrics {
|
||||
const cutoff = Date.now() - (PERIOD_MS[period] ?? PERIOD_MS['1h'])
|
||||
getMetrics(period: "1h" | "6h" | "24h" | "7d" = "1h"): PerformanceMetrics {
|
||||
const cutoff = Date.now() - (PERIOD_MS[period] ?? PERIOD_MS["1h"]);
|
||||
|
||||
const entries: PerformanceEntry[] = []
|
||||
const entries: PerformanceEntry[] = [];
|
||||
for (let i = 0; i < this.count; i++) {
|
||||
const entry = this.buffer[i]
|
||||
const entry = this.buffer[i];
|
||||
if (entry && entry.timestamp >= cutoff) {
|
||||
entries.push(entry)
|
||||
entries.push(entry);
|
||||
}
|
||||
}
|
||||
|
||||
if (entries.length === 0) {
|
||||
return { ...EMPTY_METRICS }
|
||||
return { ...EMPTY_METRICS };
|
||||
}
|
||||
|
||||
const durations = entries.map((e) => e.durationMs).sort((a, b) => a - b)
|
||||
const durations = entries.map((e) => e.durationMs).sort((a, b) => a - b);
|
||||
|
||||
const avg = durations.reduce((sum, d) => sum + d, 0) / durations.length
|
||||
const p95 = percentile(durations, 0.95)
|
||||
const p99 = percentile(durations, 0.99)
|
||||
const avg = durations.reduce((sum, d) => sum + d, 0) / durations.length;
|
||||
const p95 = percentile(durations, 0.95);
|
||||
const p99 = percentile(durations, 0.99);
|
||||
|
||||
const errorCount = entries.filter((e) => e.statusCode >= 500).length
|
||||
const errorRate = errorCount / entries.length
|
||||
const errorCount = entries.filter((e) => e.statusCode >= 500).length;
|
||||
const errorRate = errorCount / entries.length;
|
||||
|
||||
const earliestTimestamp = Math.min(...entries.map((e) => e.timestamp))
|
||||
const windowMinutes = Math.max((Date.now() - earliestTimestamp) / 60_000, 1)
|
||||
const requestsPerMinute = entries.length / windowMinutes
|
||||
const earliestTimestamp = Math.min(...entries.map((e) => e.timestamp));
|
||||
const windowMinutes = Math.max((Date.now() - earliestTimestamp) / 60_000, 1);
|
||||
const requestsPerMinute = entries.length / windowMinutes;
|
||||
|
||||
return {
|
||||
avgResponseTimeMs: Math.round(avg),
|
||||
|
|
@ -83,14 +83,14 @@ export class PerformanceTracker {
|
|||
p99ResponseTimeMs: p99,
|
||||
errorRate: Math.round(errorRate * 1000) / 1000,
|
||||
requestsPerMinute: Math.round(requestsPerMinute * 10) / 10,
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Picks the value at fraction `p` of an ascending-sorted sample.
 *
 * The index is `floor(sorted.length * p)`, clamped to the valid index range,
 * so a `p` outside 0–1 selects the first or last element.
 *
 * @param sorted Values in ascending order; this function does not sort them.
 * @param p Fraction of the sample, e.g. 0.95 for the 95th percentile.
 * @returns The selected value, or 0 when `sorted` is empty.
 */
function percentile(sorted: number[], p: number): number {
  if (sorted.length === 0) return 0;

  // `|| 0` turns a NaN index (from a NaN `p`) into the first element.
  const rawIndex = Math.floor(sorted.length * p) || 0;
  const index = Math.max(0, Math.min(rawIndex, sorted.length - 1));
  return sorted[index] ?? 0;
}
|
||||
|
||||
/** Singleton instance used across the application. */
export const performanceTracker = new PerformanceTracker(10_000);
|
||||
|
|
|
|||
|
|
@ -6,62 +6,62 @@
|
|||
* sie in MonitoringSnapshots. Evaluiert dabei Alert-Regeln.
|
||||
*/
|
||||
|
||||
import { collectMetrics } from './monitoring-service'
|
||||
import { AlertEvaluator } from './alert-evaluator'
|
||||
import { collectMetrics } from "./monitoring-service";
|
||||
import { AlertEvaluator } from "./alert-evaluator";
|
||||
|
||||
/** Handle of the running collection timer; null while no timer is set. */
let interval: ReturnType<typeof setInterval> | null = null;
const alertEvaluator = new AlertEvaluator();

/** Cached Payload instance — resolved once, reused on every tick. */
let cachedPayload: any = null;
|
||||
|
||||
/**
 * Returns the cached Payload instance, creating it on first use.
 *
 * `payload` and the project config are imported dynamically, so they are
 * only loaded when an instance is actually needed.
 */
async function getPayloadInstance(): Promise<any> {
  if (cachedPayload) return cachedPayload;

  const { getPayload } = await import("payload");
  const config = (await import(/* @vite-ignore */ "@payload-config")).default;
  cachedPayload = await getPayload({ config });
  return cachedPayload;
}
|
||||
|
||||
/**
 * Starts periodic snapshot collection.
 *
 * Collects one snapshot immediately, then repeats every
 * `MONITORING_SNAPSHOT_INTERVAL` milliseconds. A missing, non-numeric or
 * out-of-range value falls back to 60000 ms; without that guard a NaN delay
 * would make the timer fire roughly every millisecond. Calling this again
 * replaces the previous timer instead of leaking it.
 */
export async function startSnapshotCollector(): Promise<void> {
  const DEFAULT_INTERVAL_MS = 60_000;
  // Largest delay Node timers accept before falling back to 1 ms.
  const MAX_TIMER_DELAY_MS = 2_147_483_647;

  const parsed = parseInt(process.env.MONITORING_SNAPSHOT_INTERVAL || "", 10);
  const INTERVAL =
    Number.isFinite(parsed) && parsed >= 1 && parsed <= MAX_TIMER_DELAY_MS ? parsed : DEFAULT_INTERVAL_MS;
  console.log(`[SnapshotCollector] Starting (interval: ${INTERVAL}ms)`);

  // Run immediately once, then on interval
  await collectAndSave();

  // Drop a timer left over from an earlier start before installing a new one.
  if (interval) {
    clearInterval(interval);
  }
  interval = setInterval(() => {
    // collectAndSave handles its own errors, so the promise is not awaited.
    void collectAndSave();
  }, INTERVAL);
}
|
||||
|
||||
/**
 * Collects the current metrics, stores them as a `monitoring-snapshots`
 * document and evaluates the alert rules against them.
 *
 * Errors are logged and never rethrown, so a failed tick does not stop the
 * collector.
 */
async function collectAndSave(): Promise<void> {
  try {
    // Already typed `any`, so no further casts are needed below.
    const payload = await getPayloadInstance();

    const metrics = await collectMetrics();

    await payload.create({
      collection: "monitoring-snapshots",
      data: {
        timestamp: new Date().toISOString(),
        ...metrics,
      },
    });

    // Evaluate alert rules against collected metrics
    await alertEvaluator.evaluateRules(payload, metrics);
  } catch (error) {
    console.error("[SnapshotCollector] Error:", error);
    // Reset cache on error so next tick re-resolves
    cachedPayload = null;
  }
}
|
||||
|
||||
/** Cancels the collection timer, if one is set, and logs that the collector stopped. */
export async function stopSnapshotCollector(): Promise<void> {
  if (interval) {
    clearInterval(interval);
    interval = null;
  }
  console.log("[SnapshotCollector] Stopped");
}
|
||||
|
|
|
|||
|
|
@ -1,192 +1,185 @@
|
|||
// === System Health ===
|
||||
/** Host-level CPU, memory, disk, load and uptime readings. */
export interface SystemHealth {
  cpuUsagePercent: number;
  memoryUsedMB: number;
  memoryTotalMB: number;
  memoryUsagePercent: number;
  diskUsedGB: number;
  diskTotalGB: number;
  diskUsagePercent: number;
  loadAvg1: number;
  loadAvg5: number;
  uptime: number; // seconds
}
|
||||
|
||||
// === Service Statuses ===
|
||||
/** Availability state shared by all monitored services. */
export type ServiceStatusType = "online" | "warning" | "offline";

/** Status of a single managed process. */
export interface ProcessStatus {
  status: ServiceStatusType;
  pid: number;
  memoryMB: number;
  uptimeSeconds: number;
  restarts: number;
}

/** PostgreSQL connection usage and latency. */
export interface PostgresqlStatus {
  status: ServiceStatusType;
  connections: number;
  maxConnections: number;
  latencyMs: number;
}

/** PgBouncer pool usage. */
export interface PgBouncerStatus {
  status: ServiceStatusType;
  activeConnections: number;
  waitingClients: number;
  poolSize: number;
}

/** Redis memory, client and throughput readings. */
export interface RedisStatus {
  status: ServiceStatusType;
  memoryUsedMB: number;
  connectedClients: number;
  opsPerSec: number;
}

/** Status of every internal service included in a snapshot. */
export interface ServiceStatuses {
  payload: ProcessStatus;
  queueWorker: ProcessStatus;
  postgresql: PostgresqlStatus;
  pgbouncer: PgBouncerStatus;
  redis: RedisStatus;
}
|
||||
|
||||
// === External Statuses ===
|
||||
/** Result of the most recent SMTP check. */
export interface SmtpStatus {
  status: ServiceStatusType;
  lastCheck: string; // ISO date
  responseTimeMs: number;
}

/** Aggregate state of a provider's OAuth tokens. */
export type OAuthStatusType = "ok" | "expiring_soon" | "expired" | "error";

/** Token counts for one OAuth provider. */
export interface OAuthTokenStatus {
  status: OAuthStatusType;
  tokensTotal: number;
  tokensExpiringSoon: number;
  tokensExpired: number;
}

/** Last known outcome of one cron job. */
export interface CronJobStatus {
  lastRun: string; // ISO date
  status: "ok" | "failed" | "unknown";
}

/** Status of each monitored cron job. */
export interface CronStatuses {
  communitySync: CronJobStatus;
  tokenRefresh: CronJobStatus;
  youtubeSync: CronJobStatus;
}

/** A secret that is about to expire. */
export interface SecretExpiringSoon {
  name: string;
  expiresAt: string;
  daysRemaining: number;
}

/** A secret that has already expired. */
export interface SecretExpired {
  name: string;
  expiresAt: string;
}

/** A secret whose rotation is overdue. */
export interface SecretRotationOverdue {
  name: string;
  rotatedAt: string;
  ageDays: number;
}

/** Outcome of the secrets health check. */
export interface SecretsHealthStatus {
  status: "ok" | "warning" | "critical";
  checkedAt: string;
  missing: string[];
  expiringSoon: SecretExpiringSoon[];
  expired: SecretExpired[];
  rotationOverdue: SecretRotationOverdue[];
}

/** Security event counters for one observation window. */
export interface SecurityMetricsStatus {
  windowMs: number;
  counters: Array<{
    eventType: string;
    count: number;
    windowStart: string;
  }>;
}

/** Status of every external dependency included in a snapshot. */
export interface ExternalStatuses {
  smtp: SmtpStatus;
  metaOAuth: OAuthTokenStatus;
  youtubeOAuth: OAuthTokenStatus;
  cronJobs: CronStatuses;
  secrets: SecretsHealthStatus;
  securityEvents: SecurityMetricsStatus;
}
|
||||
|
||||
// === Performance ===
|
||||
/** Aggregated request timing, error and throughput figures. */
export interface PerformanceMetrics {
  avgResponseTimeMs: number;
  p95ResponseTimeMs: number;
  p99ResponseTimeMs: number;
  errorRate: number; // 0-1
  requestsPerMinute: number;
}
|
||||
|
||||
// === Full Snapshot ===
|
||||
/** One complete monitoring snapshot. */
export interface SystemMetrics {
  timestamp: string; // ISO date
  system: SystemHealth;
  services: ServiceStatuses;
  external: ExternalStatuses;
  performance: PerformanceMetrics;
}
|
||||
|
||||
// === SSE Events (discriminated union) ===
|
||||
/** Event payloads, discriminated by `type`. */
export type MonitoringEvent =
  | { type: "health"; data: SystemHealth }
  | { type: "service"; data: Partial<ServiceStatuses> }
  | { type: "alert"; data: AlertEvent }
  | { type: "log"; data: LogEvent }
  | { type: "performance"; data: PerformanceMetrics };

/** A fired alert: the rule, the measured value and the threshold it was compared with. */
export interface AlertEvent {
  id: string;
  ruleId: string;
  metric: string;
  value: number;
  threshold: number;
  severity: AlertSeverity;
  message: string;
  timestamp: string;
}

/** A single log entry with optional structured context. */
export interface LogEvent {
  id: string;
  level: LogLevel;
  source: LogSource;
  message: string;
  timestamp: string;
  context?: Record<string, unknown>;
}
|
||||
|
||||
// === Enums as union types ===
|
||||
/** Comparison operator used by an alert rule. */
export type AlertCondition = "gt" | "lt" | "eq" | "gte" | "lte";
/** Severity assigned to an alert. */
export type AlertSeverity = "warning" | "error" | "critical";
/** Severity of a log entry. */
export type LogLevel = "debug" | "info" | "warn" | "error" | "fatal";
/** Subsystem that produced a log entry. */
export type LogSource = "payload" | "queue-worker" | "cron" | "email" | "oauth" | "sync" | "security";
|
||||
|
||||
// === Performance Tracker Entry ===
|
||||
/** One tracked request as stored by the performance tracker. */
export interface PerformanceEntry {
  timestamp: number; // Date.now()
  method: string;
  path: string;
  statusCode: number;
  durationMs: number;
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue