"""Data validation functions for ICD codes and KVNR.""" import re ICD_PATTERN = re.compile(r"^[A-Z]\d{2}(\.\d{1,2})?$") KVNR_PATTERN = re.compile(r"^[A-Z]\d{9}$") def validate_icd(code: str) -> str: """Validate and normalize a single ICD code. Returns uppercase stripped code.""" code = code.strip().upper() if not code: raise ValueError("Empty ICD code") if not ICD_PATTERN.match(code): raise ValueError(f"Invalid ICD code format: '{code}'") return code def split_icd_codes(raw: str) -> list[str]: """Split a string of multiple ICD codes (comma or semicolon separated).""" if not raw or not raw.strip(): return [] # Split by comma, semicolon, or whitespace+comma combinations codes = re.split(r"[,;]\s*", raw.strip()) return [c.strip() for c in codes if c.strip()] def normalize_icd_hauptgruppe(code: str) -> str: """Extract hauptgruppe from ICD code: 'C50.1' -> 'C50'.""" code = code.strip().upper() return code[:3] if len(code) >= 3 else code def validate_kvnr(kvnr: str) -> str: """Validate KVNR format (letter + 9 digits). Returns stripped uppercase.""" kvnr = kvnr.strip().upper() if not kvnr: raise ValueError("Empty KVNR") if not KVNR_PATTERN.match(kvnr): raise ValueError(f"Invalid KVNR format: '{kvnr}'") return kvnr