"""CRM CSV parser for DAK Zweitmeinungs-Portal. Parses CRM CSV exports with: - UTF-8-BOM encoding - Comma-delimited columns: Hauptkontakt, Name, Thema, Erstellungsdatum, Modul - Pipe-delimited Hauptkontakt: "Nachname | Vorname | Geburtsdatum | KVNR" - German date formats: DD.MM.YYYY (Geburtsdatum), DD.MM.YY, HH:MM (Erstellungsdatum) - Modul-to-Fallgruppe mapping """ import csv import io import logging from dataclasses import dataclass from datetime import date from typing import Optional from app.utils.fallgruppe_map import map_modul_to_fallgruppe from app.utils.kw_utils import date_to_jahr, date_to_kw, parse_german_date logger = logging.getLogger(__name__) @dataclass class ParsedCase: """A single parsed case from a CRM CSV row.""" nachname: str vorname: Optional[str] geburtsdatum: Optional[date] kvnr: Optional[str] thema: str fallgruppe: str datum: date jahr: int kw: int crm_ticket_id: Optional[str] def parse_hauptkontakt(raw: str) -> dict: """Parse pipe-delimited contact string. Format: "Nachname | Vorname | Geburtsdatum | KVNR" Edge cases handled: - Missing KVNR: "Daum | Luana | 05.02.2016 |" - Missing Geburtsdatum: "Schaumann | Janina | |" - Bad date: "Krölls | Peter | 29.08.0196 | S361390622" - Missing Vorname: "Wuffy | | |" """ parts = [p.strip() for p in raw.split("|")] result = { "nachname": parts[0] if len(parts) > 0 else "", "vorname": parts[1] if len(parts) > 1 and parts[1] else None, "geburtsdatum": None, "kvnr": parts[3] if len(parts) > 3 and parts[3] else None, } if len(parts) > 2 and parts[2]: try: result["geburtsdatum"] = parse_german_date(parts[2]) except (ValueError, Exception) as e: logger.warning("Could not parse Geburtsdatum '%s': %s", parts[2], e) return result def parse_csv(content: bytes, filename: str = "") -> list[ParsedCase]: """Parse CRM CSV file content into list of ParsedCase objects. Args: content: Raw bytes of the CSV file (UTF-8-BOM encoded). filename: Optional filename for logging context. Returns: List of successfully parsed cases. Rows with empty/unmappable Modul are skipped (logged as warnings). Other parse errors are also skipped and logged. """ text = content.decode("utf-8-sig") # Handle BOM reader = csv.DictReader(io.StringIO(text)) cases: list[ParsedCase] = [] errors: list[str] = [] skipped = 0 for i, row in enumerate(reader, start=2): # row 1 is header try: # Parse pipe-delimited contact field kontakt = parse_hauptkontakt(row.get("Hauptkontakt", "")) # Parse creation date datum_str = row.get("Erstellungsdatum", "").strip() if datum_str: datum = parse_german_date(datum_str) else: datum = date.today() # Map Modul to Fallgruppe -- skip rows with empty/unknown Modul modul = row.get("Modul", "").strip() if not modul: skipped += 1 logger.debug( "Skipping row %d in %s: empty Modul field", i, filename ) continue try: fallgruppe = map_modul_to_fallgruppe(modul) except ValueError: skipped += 1 logger.warning( "Skipping row %d in %s: unmappable Modul '%s'", i, filename, modul, ) continue cases.append( ParsedCase( nachname=kontakt["nachname"], vorname=kontakt["vorname"], geburtsdatum=kontakt["geburtsdatum"], kvnr=kontakt["kvnr"], thema=row.get("Thema", "").strip(), fallgruppe=fallgruppe, datum=datum, jahr=date_to_jahr(datum), kw=date_to_kw(datum), crm_ticket_id=row.get("Name", "").strip() or None, ) ) except Exception as e: errors.append(f"Row {i}: {e}") logger.warning("CSV parse error in %s row %d: %s", filename, i, e) logger.info( "CSV parsing of '%s' complete: %d parsed, %d skipped, %d errors", filename, len(cases), skipped, len(errors), ) return cases