feat: CRM CSV parser with pipe-delimited contact parsing

Parse CRM CSV exports (UTF-8-BOM, comma-delimited) with:

- Pipe-delimited Hauptkontakt field (Nachname|Vorname|Geburtsdatum|KVNR)
- German date formats (DD.MM.YYYY for Geburtsdatum; "DD.MM.YY, HH:MM" for Erstellungsdatum)
- Modul-to-Fallgruppe mapping
- Graceful handling of missing KVNR, bad dates, empty fields, spam rows
- 19 tests (synthetic + all 4 real CSV files)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 17:13:42 +00:00 · 2026-02-24 07:45:34 +00:00 · 2026-02-24 07:45:34 +00:00 · 84d11822e0
commit 84d11822e0
parent 178d40d036
2 changed files with 464 additions and 0 deletions
--- a/backend/app/services/csv_parser.py
+++ b/backend/app/services/csv_parser.py
@ -0,0 +1,143 @@
+"""CRM CSV parser for DAK Zweitmeinungs-Portal.
+
+Parses CRM CSV exports with:
+- UTF-8-BOM encoding
+- Comma-delimited columns: Hauptkontakt, Name, Thema, Erstellungsdatum, Modul
+- Pipe-delimited Hauptkontakt: "Nachname | Vorname | Geburtsdatum | KVNR"
+- German date formats: DD.MM.YYYY (Geburtsdatum), DD.MM.YY, HH:MM (Erstellungsdatum)
+- Modul-to-Fallgruppe mapping
+"""
+
+import csv
+import io
+import logging
+from dataclasses import dataclass
+from datetime import date
+from typing import Optional
+
+from app.utils.fallgruppe_map import map_modul_to_fallgruppe
+from app.utils.kw_utils import date_to_jahr, date_to_kw, parse_german_date
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ParsedCase:
+    """A single parsed case from a CRM CSV row.
+
+    Instances are built by ``parse_csv``; the four contact fields come from
+    the pipe-delimited ``Hauptkontakt`` column via ``parse_hauptkontakt``.
+    """
+
+    nachname: str  # first Hauptkontakt segment, stripped
+    vorname: Optional[str]  # second segment; None when blank
+    geburtsdatum: Optional[date]  # third segment; None when blank or unparseable
+    kvnr: Optional[str]  # fourth segment; None when blank
+    thema: str  # stripped "Thema" column
+    fallgruppe: str  # map_modul_to_fallgruppe() applied to the "Modul" column
+    datum: date  # parsed "Erstellungsdatum"; date.today() when that column is empty
+    jahr: int  # date_to_jahr(datum)
+    kw: int  # date_to_kw(datum)
+    crm_ticket_id: Optional[str]  # stripped "Name" column; None when empty
+
+
+def parse_hauptkontakt(raw: str) -> dict:
+    """Parse the pipe-delimited ``Hauptkontakt`` contact string.
+
+    Expected format: "Nachname | Vorname | Geburtsdatum | KVNR". Every segment
+    is stripped of surrounding whitespace; segments beyond the fourth are
+    ignored.
+
+    Edge cases handled:
+    - Missing KVNR: "Daum | Luana | 05.02.2016 |"
+    - Missing Geburtsdatum: "Schaumann | Janina |  |"
+    - Bad date: "Krölls | Peter | 29.08.0196 | S361390622"
+    - Missing Vorname: "Wuffy |  |  |"
+    - Empty or ``None`` input (e.g. a short CSV row)
+
+    Args:
+        raw: Raw value of the Hauptkontakt column.
+
+    Returns:
+        Dict with the keys ``nachname`` (str, possibly ""), ``vorname``,
+        ``geburtsdatum`` and ``kvnr``. The last three are ``None`` when the
+        segment is blank; ``geburtsdatum`` is also ``None`` when the date
+        cannot be parsed (a warning is logged in that case).
+    """
+    # Pad to four segments so each field can be read without length checks.
+    segments = [part.strip() for part in (raw or "").split("|")]
+    nachname, vorname, geburtsdatum_str, kvnr = (segments + [""] * 4)[:4]
+
+    geburtsdatum = None
+    if geburtsdatum_str:
+        try:
+            geburtsdatum = parse_german_date(geburtsdatum_str)
+        except Exception as e:
+            # Best effort: an unparseable birth date must not drop the contact.
+            logger.warning(
+                "Could not parse Geburtsdatum '%s': %s", geburtsdatum_str, e
+            )
+
+    return {
+        "nachname": nachname,
+        "vorname": vorname or None,
+        "geburtsdatum": geburtsdatum,
+        "kvnr": kvnr or None,
+    }
+
+
+def parse_csv(content: bytes, filename: str = "") -> list[ParsedCase]:
+    """Parse CRM CSV file content into a list of ParsedCase objects.
+
+    Rows are filtered on the ``Modul`` column before anything else is parsed,
+    so junk rows are counted as skipped even when their other fields are
+    malformed.
+
+    Args:
+        content: Raw bytes of the CSV file (UTF-8, with or without BOM).
+        filename: Optional filename, used only for log messages.
+
+    Returns:
+        List of successfully parsed cases, in file order. Rows with an empty
+        Modul are skipped (logged at debug level), rows with an unmappable
+        Modul are skipped (logged as warnings), and rows that raise any other
+        error are dropped and logged as warnings. A row without an
+        Erstellungsdatum gets today's date.
+
+    Raises:
+        UnicodeDecodeError: If ``content`` is not valid UTF-8.
+    """
+    text = content.decode("utf-8-sig")  # utf-8-sig drops a leading BOM if present
+    reader = csv.DictReader(io.StringIO(text))
+
+    # A wrong delimiter or header would otherwise silently yield zero cases.
+    expected = ("Hauptkontakt", "Name", "Thema", "Erstellungsdatum", "Modul")
+    missing = [col for col in expected if col not in (reader.fieldnames or [])]
+    if missing:
+        logger.warning(
+            "CSV '%s' is missing expected column(s): %s",
+            filename,
+            ", ".join(missing),
+        )
+
+    def field(row: dict, column: str) -> str:
+        # DictReader fills columns absent from a short row with None.
+        return (row.get(column) or "").strip()
+
+    cases: list[ParsedCase] = []
+    errors: list[str] = []
+    skipped = 0
+
+    for i, row in enumerate(reader, start=2):  # row 1 is header
+        try:
+            # Map Modul to Fallgruppe first -- skip rows with empty/unknown Modul
+            modul = field(row, "Modul")
+            if not modul:
+                skipped += 1
+                logger.debug(
+                    "Skipping row %d in %s: empty Modul field", i, filename
+                )
+                continue
+
+            try:
+                fallgruppe = map_modul_to_fallgruppe(modul)
+            except ValueError:
+                skipped += 1
+                logger.warning(
+                    "Skipping row %d in %s: unmappable Modul '%s'",
+                    i,
+                    filename,
+                    modul,
+                )
+                continue
+
+            # Parse pipe-delimited contact field
+            kontakt = parse_hauptkontakt(row.get("Hauptkontakt") or "")
+
+            # Parse creation date; an empty field falls back to today
+            datum_str = field(row, "Erstellungsdatum")
+            datum = parse_german_date(datum_str) if datum_str else date.today()
+
+            cases.append(
+                ParsedCase(
+                    nachname=kontakt["nachname"],
+                    vorname=kontakt["vorname"],
+                    geburtsdatum=kontakt["geburtsdatum"],
+                    kvnr=kontakt["kvnr"],
+                    thema=field(row, "Thema"),
+                    fallgruppe=fallgruppe,
+                    datum=datum,
+                    jahr=date_to_jahr(datum),
+                    kw=date_to_kw(datum),
+                    crm_ticket_id=field(row, "Name") or None,
+                )
+            )
+        except Exception as e:
+            # Per-row boundary: one bad row must not abort the whole import.
+            errors.append(f"Row {i}: {e}")
+            logger.warning("CSV parse error in %s row %d: %s", filename, i, e)
+
+    logger.info(
+        "CSV parsing of '%s' complete: %d parsed, %d skipped, %d errors",
+        filename,
+        len(cases),
+        skipped,
+        len(errors),
+    )
+
+    return cases
--- a/backend/tests/test_csv_parser.py
+++ b/backend/tests/test_csv_parser.py
@ -0,0 +1,321 @@
+"""Tests for CRM CSV parser: synthetic and real-file tests."""
+
+import pathlib
+from datetime import date
+
+import pytest
+
+from app.services.csv_parser import ParsedCase, parse_csv, parse_hauptkontakt
+from app.utils.fallgruppe_map import VALID_FALLGRUPPEN
+
+# Directory holding the real CRM exports read by TestParseRealCSV. The path is
+# absolute and machine-specific; each real-file test is skipped through
+# pytest.mark.skipif when its CSV does not exist under this directory.
+DATA_DIR = pathlib.Path("/home/frontend/dak_c2s/data")
+
+
+# ── parse_hauptkontakt tests ─────────────────────────────────────────
+
+
+class TestParseHauptkontakt:
+    """Unit tests for parse_hauptkontakt on single contact strings."""
+
+    def test_full_contact(self):
+        """All four fields present and valid."""
+        result = parse_hauptkontakt("Tonn | Regina | 28.04.1960 | D410126355")
+        assert result["nachname"] == "Tonn"
+        assert result["vorname"] == "Regina"
+        assert result["geburtsdatum"] == date(1960, 4, 28)
+        assert result["kvnr"] == "D410126355"
+
+    def test_missing_kvnr(self):
+        """KVNR field empty (trailing pipe)."""
+        result = parse_hauptkontakt("Daum | Luana | 05.02.2016 |")
+        assert result["nachname"] == "Daum"
+        assert result["vorname"] == "Luana"
+        assert result["geburtsdatum"] == date(2016, 2, 5)
+        assert result["kvnr"] is None
+
+    def test_bad_date(self):
+        """Invalid date like 29.08.0196 -- geburtsdatum should be None."""
+        result = parse_hauptkontakt("Krölls | Peter | 29.08.0196 | S361390622")
+        assert result["nachname"] == "Krölls"
+        assert result["vorname"] == "Peter"
+        assert result["geburtsdatum"] is None  # bad date => None
+        assert result["kvnr"] == "S361390622"
+
+    def test_missing_geburtsdatum(self):
+        """Geburtsdatum field empty (spaces only)."""
+        result = parse_hauptkontakt("Schaumann | Janina |  |")
+        assert result["nachname"] == "Schaumann"
+        assert result["vorname"] == "Janina"
+        assert result["geburtsdatum"] is None
+        assert result["kvnr"] is None
+
+    def test_missing_geburtsdatum_with_kvnr(self):
+        """Geburtsdatum empty but KVNR present."""
+        result = parse_hauptkontakt("Schuber | Fritz |  | C352208902")
+        assert result["nachname"] == "Schuber"
+        assert result["vorname"] == "Fritz"
+        assert result["geburtsdatum"] is None
+        assert result["kvnr"] == "C352208902"
+
+    def test_missing_vorname(self):
+        """Vorname field empty."""
+        result = parse_hauptkontakt("Wuffy |  |  |")
+        assert result["nachname"] == "Wuffy"
+        assert result["vorname"] is None
+        assert result["geburtsdatum"] is None
+        assert result["kvnr"] is None
+
+    def test_whitespace_handling(self):
+        """Extra whitespace around pipes is stripped from every field."""
+        result = parse_hauptkontakt("  Tonn  |  Regina  | 28.04.1960 | D410126355 ")
+        assert result["nachname"] == "Tonn"
+        assert result["vorname"] == "Regina"
+        # The date segment is padded too, so it must still parse.
+        assert result["geburtsdatum"] == date(1960, 4, 28)
+        assert result["kvnr"] == "D410126355"
+
+    def test_hyphenated_name(self):
+        """Hyphenated names are preserved and the other fields still parse."""
+        result = parse_hauptkontakt(
+            "Hähle-Jakelski | Rüdiger | 14.10.1941 | X304698107"
+        )
+        assert result["nachname"] == "Hähle-Jakelski"
+        assert result["vorname"] == "Rüdiger"
+        assert result["geburtsdatum"] == date(1941, 10, 14)
+        assert result["kvnr"] == "X304698107"
+
+
+# ── parse_csv synthetic tests ─────────────────────────────────────────
+
+
+def _make_csv_bytes(*rows: str, bom: bool = True) -> bytes:
+    """Assemble CSV file bytes: the fixed header line followed by ``rows``.
+
+    Lines are joined with a single newline (no trailing newline) and encoded
+    as UTF-8. The UTF-8 BOM is prepended unless ``bom`` is false.
+    """
+    header = '"Hauptkontakt","Name","Thema","Erstellungsdatum","Modul"'
+    payload = "\n".join((header, *rows)).encode("utf-8")
+    if bom:
+        return b"\xef\xbb\xbf" + payload
+    return payload
+
+
+class TestParseCSVSynthetic:
+    """parse_csv exercised against small CSV documents built in memory."""
+
+    @staticmethod
+    def _row(kontakt, name, thema, erstellt, modul):
+        """Render one data row with every field wrapped in double quotes."""
+        fields = (kontakt, name, thema, erstellt, modul)
+        return ",".join(f'"{value}"' for value in fields)
+
+    def test_basic_single_row(self):
+        """A single valid row yields one fully populated case."""
+        data = _make_csv_bytes(
+            self._row(
+                "Tonn | Regina | 28.04.1960 | D410126355",
+                "103486",
+                "Zweitmeinung für Regina Tonn",
+                "02.02.26, 08:50",
+                "Zweitmeinung Kardiologie",
+            )
+        )
+        # Dataclass equality compares all ten fields at once.
+        assert parse_csv(data) == [
+            ParsedCase(
+                nachname="Tonn",
+                vorname="Regina",
+                geburtsdatum=date(1960, 4, 28),
+                kvnr="D410126355",
+                thema="Zweitmeinung für Regina Tonn",
+                fallgruppe="kardio",
+                datum=date(2026, 2, 2),
+                jahr=2026,
+                kw=6,
+                crm_ticket_id="103486",
+            )
+        ]
+
+    def test_multiple_fallgruppen(self):
+        """Three rows with different Modul values map to three Fallgruppen."""
+        data = _make_csv_bytes(
+            self._row(
+                "A | B | 01.01.1990 | X123456789",
+                "100",
+                "Thema 1",
+                "02.02.26, 09:00",
+                "Zweitmeinung Onkologie",
+            ),
+            self._row(
+                "C | D | 15.06.1985 | Y987654321",
+                "101",
+                "Thema 2",
+                "03.02.26, 10:00",
+                "Zweitmeinung Gallenblase",
+            ),
+            self._row(
+                "E | F | 20.12.1970 |",
+                "102",
+                "Thema 3",
+                "04.02.26, 11:00",
+                "Zweitmeinung Schilddrüse",
+            ),
+        )
+        parsed = parse_csv(data)
+        assert len(parsed) == 3
+        onko, galle, sd = parsed
+        assert onko.fallgruppe == "onko"
+        assert galle.fallgruppe == "galle"
+        assert sd.fallgruppe == "sd"
+        assert sd.kvnr is None
+
+    def test_empty_modul_skipped(self):
+        """A row whose Modul is empty (spam/junk entry) is left out."""
+        data = _make_csv_bytes(
+            self._row(
+                "Tonn | Regina | 28.04.1960 | D410126355",
+                "103486",
+                "Thema",
+                "02.02.26, 08:50",
+                "Zweitmeinung Onkologie",
+            ),
+            self._row("Wuffy |  |  |", "103767", "Spam", "17.02.26, 17:16", ""),
+        )
+        parsed = parse_csv(data)
+        assert len(parsed) == 1
+        assert parsed[0].nachname == "Tonn"
+
+    def test_bad_geburtsdatum_still_parses(self):
+        """A bad Geburtsdatum year keeps the row but sets geburtsdatum=None."""
+        data = _make_csv_bytes(
+            self._row(
+                "Krölls | Peter | 29.08.0196 | S361390622",
+                "103514",
+                "Zweitmeinung für Peter Krölls",
+                "04.02.26, 11:06",
+                "Zweitmeinung Onkologie",
+            )
+        )
+        parsed = parse_csv(data)
+        assert len(parsed) == 1
+        only = parsed[0]
+        assert only.nachname == "Krölls"
+        assert only.geburtsdatum is None
+        assert only.kvnr == "S361390622"
+        assert only.fallgruppe == "onko"
+
+    def test_without_bom(self):
+        """Content without a BOM parses just the same."""
+        data = _make_csv_bytes(
+            self._row(
+                "A | B | 01.01.1990 | X123456789",
+                "100",
+                "Thema",
+                "02.02.26, 09:00",
+                "Zweitmeinung Onkologie",
+            ),
+            bom=False,
+        )
+        assert len(parse_csv(data)) == 1
+
+    def test_empty_csv_returns_empty(self):
+        """A header-only document produces an empty list."""
+        assert parse_csv(_make_csv_bytes()) == []
+
+    def test_crm_ticket_id_from_name_column(self):
+        """The 'Name' column is what ends up in crm_ticket_id."""
+        data = _make_csv_bytes(
+            self._row(
+                "A | B | 01.01.1990 |",
+                "999888",
+                "Thema",
+                "10.02.26, 10:00",
+                "Zweitmeinung Intensiv",
+            )
+        )
+        first = parse_csv(data)[0]
+        assert first.crm_ticket_id == "999888"
+        assert first.fallgruppe == "intensiv"
+
+
+# ── Real CSV file tests ────────────────────────────────────────────────
+
+
+class TestParseRealCSV:
+    """parse_csv run against real CRM exports; a test skips if its file is absent."""
+
+    @staticmethod
+    def _load(name):
+        """Read ``name`` from DATA_DIR and parse it, passing the name for log context."""
+        return parse_csv((DATA_DIR / name).read_bytes(), filename=name)
+
+    @staticmethod
+    def _first_named(cases, nachname):
+        """Return the first case with the given nachname (IndexError if none)."""
+        return [c for c in cases if c.nachname == nachname][0]
+
+    @pytest.mark.skipif(
+        not DATA_DIR.joinpath("2026-02-06-0406.csv").exists(),
+        reason="Real CSV file not available",
+    )
+    def test_parse_real_csv_feb06(self):
+        """Check counts, per-row invariants and known edge cases of the Feb 06 export."""
+        cases = self._load("2026-02-06-0406.csv")
+
+        # 16 data rows, none of them with an empty Modul
+        assert len(cases) > 0
+        assert len(cases) == 16
+
+        # Invariants that must hold for every parsed row
+        for c in cases:
+            assert c.fallgruppe in VALID_FALLGRUPPEN, (
+                f"Invalid fallgruppe '{c.fallgruppe}' for {c.nachname}"
+            )
+            assert c.nachname  # nachname is always present
+            assert c.datum is not None
+            assert c.jahr >= 2026
+            assert 1 <= c.kw <= 53
+            assert c.crm_ticket_id is not None
+
+        # First row of the file
+        head = cases[0]
+        assert head.nachname == "Tonn"
+        assert head.vorname == "Regina"
+        assert head.kvnr == "D410126355"
+        assert head.fallgruppe == "kardio"
+        assert head.datum == date(2026, 2, 2)
+        assert head.kw == 6
+
+        # Edge case: missing KVNR
+        daum = self._first_named(cases, "Daum")
+        assert daum.kvnr is None
+        assert daum.fallgruppe == "intensiv"
+
+        # Edge case: bad Geburtsdatum
+        krolls = self._first_named(cases, "Krölls")
+        assert krolls.geburtsdatum is None
+        assert krolls.kvnr == "S361390622"
+
+        # Edge case: missing Geburtsdatum
+        schaumann = self._first_named(cases, "Schaumann")
+        assert schaumann.geburtsdatum is None
+        assert schaumann.kvnr is None
+
+    @pytest.mark.skipif(
+        not DATA_DIR.joinpath("2026-02-17-1041.csv").exists(),
+        reason="Real CSV file not available",
+    )
+    def test_parse_real_csv_with_spam_rows(self):
+        """An export containing empty-Modul (spam) rows yields only the valid cases."""
+        cases = self._load("2026-02-17-1041.csv")
+
+        # 6 data rows, 2 of them with an empty Modul => 4 valid cases
+        assert len(cases) == 4
+
+        # The spam contacts (Wuffy, Apotheke) must be absent
+        surnames = [c.nachname for c in cases]
+        for spam in ("Wuffy", "Apotheke"):
+            assert spam not in surnames
+
+        # Every remaining case carries a known fallgruppe
+        for c in cases:
+            assert c.fallgruppe in VALID_FALLGRUPPEN
+
+    @pytest.mark.skipif(
+        not DATA_DIR.joinpath("2026-02-17-0553.csv").exists(),
+        reason="Real CSV file not available",
+    )
+    def test_parse_real_csv_feb17(self):
+        """The Feb 17 (05:53) export parses to at least one case."""
+        cases = self._load("2026-02-17-0553.csv")
+
+        assert len(cases) > 0
+
+        # Fritz Schuber, if present: empty Geburtsdatum but a KVNR
+        matches = [c for c in cases if c.nachname == "Schuber"]
+        if matches:
+            hit = matches[0]
+            assert hit.geburtsdatum is None
+            assert hit.kvnr == "C352208902"
+
+    @pytest.mark.skipif(
+        not DATA_DIR.joinpath("2026-02-23-0902.csv").exists(),
+        reason="Real CSV file not available",
+    )
+    def test_parse_real_csv_feb23(self):
+        """The Feb 23 export parses, and every case has a fallgruppe and nachname."""
+        cases = self._load("2026-02-23-0902.csv")
+
+        assert len(cases) > 0
+        for c in cases:
+            assert c.fallgruppe in VALID_FALLGRUPPEN
+            assert c.nachname