# ---------------------------------------------------------------------------
# File: backend/app/services/csv_parser.py  (new file, mode 100644)
# ---------------------------------------------------------------------------
"""CRM CSV parser for DAK Zweitmeinungs-Portal.

Parses CRM CSV exports with:
- UTF-8-BOM encoding
- Comma-delimited columns: Hauptkontakt, Name, Thema, Erstellungsdatum, Modul
- Pipe-delimited Hauptkontakt: "Nachname | Vorname | Geburtsdatum | KVNR"
- German date formats: DD.MM.YYYY (Geburtsdatum), DD.MM.YY, HH:MM (Erstellungsdatum)
- Modul-to-Fallgruppe mapping
"""

import csv
import io
import logging
from dataclasses import dataclass
from datetime import date
from typing import Optional

from app.utils.fallgruppe_map import map_modul_to_fallgruppe
from app.utils.kw_utils import date_to_jahr, date_to_kw, parse_german_date

logger = logging.getLogger(__name__)

# Columns the parser reads from every row of the export.
_EXPECTED_COLUMNS = ("Hauptkontakt", "Name", "Thema", "Erstellungsdatum", "Modul")


@dataclass
class ParsedCase:
    """A single parsed case from a CRM CSV row."""

    # Contact data taken from the pipe-delimited "Hauptkontakt" column.
    nachname: str
    vorname: Optional[str]
    geburtsdatum: Optional[date]
    kvnr: Optional[str]
    # Stripped content of the "Thema" column.
    thema: str
    # Result of map_modul_to_fallgruppe() for the "Modul" column.
    fallgruppe: str
    # Parsed "Erstellungsdatum" plus the year / calendar week derived from it.
    datum: date
    jahr: int
    kw: int
    # Stripped content of the "Name" column, or None when it is empty.
    crm_ticket_id: Optional[str]


def parse_hauptkontakt(raw: str) -> dict:
    """Parse a pipe-delimited contact string.

    Format: "Nachname | Vorname | Geburtsdatum | KVNR"

    Edge cases handled:
    - Missing KVNR: "Daum | Luana | 05.02.2016 |"
    - Missing Geburtsdatum: "Schaumann | Janina | |"
    - Bad date: "Krölls | Peter | 29.08.0196 | S361390622"
    - Missing Vorname: "Wuffy | | |"
    - Fewer than four fields, an empty string or ``None``

    Args:
        raw: Content of the "Hauptkontakt" column.

    Returns:
        Dict with the keys ``nachname`` (str, possibly empty), ``vorname``
        (str or None), ``geburtsdatum`` (date or None) and ``kvnr``
        (str or None). Fields beyond the fourth are ignored.
    """
    parts = [part.strip() for part in (raw or "").split("|")]
    # Pad to four entries so the positional unpacking below never fails.
    parts += [""] * (4 - len(parts))
    nachname, vorname, geburtsdatum_raw, kvnr = parts[:4]

    geburtsdatum = None
    if geburtsdatum_raw:
        try:
            geburtsdatum = parse_german_date(geburtsdatum_raw)
        except Exception as e:
            # Deliberately best-effort: an unparseable birth date must not
            # discard the rest of the contact. The exception types raised by
            # parse_german_date are not visible here, hence the broad catch.
            logger.warning(
                "Could not parse Geburtsdatum '%s': %s", geburtsdatum_raw, e
            )

    return {
        "nachname": nachname,
        "vorname": vorname or None,
        "geburtsdatum": geburtsdatum,
        "kvnr": kvnr or None,
    }


def _field(row: dict, column: str) -> str:
    """Return the stripped value of *column*, or "" if it is missing or None.

    csv.DictReader fills the trailing columns of a short row with ``None``,
    so ``row.get(column, "")`` alone is not enough to guarantee a string.
    """
    return (row.get(column) or "").strip()


def parse_csv(content: bytes, filename: str = "") -> list[ParsedCase]:
    """Parse CRM CSV file content into a list of ParsedCase objects.

    Args:
        content: Raw bytes of the CSV file (UTF-8, with or without BOM).
        filename: Optional filename for logging context.

    Returns:
        List of successfully parsed cases. Rows with an empty or unmappable
        Modul are skipped (and logged). Rows that fail for any other reason
        are also left out and logged as errors.

    Raises:
        UnicodeDecodeError: If *content* is not valid UTF-8.
    """
    text = content.decode("utf-8-sig")  # "utf-8-sig" strips a leading BOM
    reader = csv.DictReader(io.StringIO(text))

    # A wrong header would otherwise make every row look like an empty-Modul
    # row and disappear at debug level only.
    header = reader.fieldnames
    if header is not None:
        missing = [col for col in _EXPECTED_COLUMNS if col not in header]
        if missing:
            logger.warning(
                "CSV '%s' is missing expected column(s): %s",
                filename,
                ", ".join(missing),
            )

    cases: list[ParsedCase] = []
    skipped = 0
    error_count = 0

    for i, row in enumerate(reader, start=2):  # row 1 is header
        try:
            # Check Modul first so junk rows count as skipped, not as errors,
            # even when their other fields are unparseable.
            modul = _field(row, "Modul")
            if not modul:
                skipped += 1
                logger.debug(
                    "Skipping row %d in %s: empty Modul field", i, filename
                )
                continue

            try:
                fallgruppe = map_modul_to_fallgruppe(modul)
            except ValueError:
                skipped += 1
                logger.warning(
                    "Skipping row %d in %s: unmappable Modul '%s'",
                    i,
                    filename,
                    modul,
                )
                continue

            kontakt = parse_hauptkontakt(_field(row, "Hauptkontakt"))

            datum_str = _field(row, "Erstellungsdatum")
            if datum_str:
                datum = parse_german_date(datum_str)
            else:
                # Fallback kept for backward compatibility, but made visible
                # because it assigns the case to the current calendar week.
                logger.warning(
                    "Row %d in %s has no Erstellungsdatum; using today's date",
                    i,
                    filename,
                )
                datum = date.today()

            cases.append(
                ParsedCase(
                    nachname=kontakt["nachname"],
                    vorname=kontakt["vorname"],
                    geburtsdatum=kontakt["geburtsdatum"],
                    kvnr=kontakt["kvnr"],
                    thema=_field(row, "Thema"),
                    fallgruppe=fallgruppe,
                    datum=datum,
                    jahr=date_to_jahr(datum),
                    kw=date_to_kw(datum),
                    crm_ticket_id=_field(row, "Name") or None,
                )
            )
        except Exception as e:
            # Per-row boundary: one broken row must not abort the whole import.
            error_count += 1
            logger.warning("CSV parse error in %s row %d: %s", filename, i, e)

    logger.info(
        "CSV parsing of '%s' complete: %d parsed, %d skipped, %d errors",
        filename,
        len(cases),
        skipped,
        error_count,
    )

    return cases


# ---------------------------------------------------------------------------
# File: backend/tests/test_csv_parser.py  (new file, mode 100644)
# ---------------------------------------------------------------------------
"""Tests for CRM CSV parser: synthetic and real-file tests."""

import os
import pathlib
from datetime import date

import pytest

from app.services.csv_parser import ParsedCase, parse_csv, parse_hauptkontakt
from app.utils.fallgruppe_map import VALID_FALLGRUPPEN

# Directory holding real CRM exports. Override with DAK_C2S_DATA_DIR on
# machines where the exports live elsewhere; tests skip when files are absent.
DATA_DIR = pathlib.Path(
    os.environ.get("DAK_C2S_DATA_DIR", "/home/frontend/dak_c2s/data")
)


def _requires_data_file(name: str):
    """Return a skip marker that fires when the real export *name* is absent."""
    return pytest.mark.skipif(
        not (DATA_DIR / name).exists(),
        reason="Real CSV file not available",
    )


# ── parse_hauptkontakt tests ─────────────────────────────────────────


class TestParseHauptkontakt:
    def test_full_contact(self):
        """All four fields present and valid."""
        result = parse_hauptkontakt("Tonn | Regina | 28.04.1960 | D410126355")
        assert result["nachname"] == "Tonn"
        assert result["vorname"] == "Regina"
        assert result["geburtsdatum"] == date(1960, 4, 28)
        assert result["kvnr"] == "D410126355"

    def test_missing_kvnr(self):
        """KVNR field empty (trailing pipe)."""
        result = parse_hauptkontakt("Daum | Luana | 05.02.2016 |")
        assert result["nachname"] == "Daum"
        assert result["vorname"] == "Luana"
        assert result["geburtsdatum"] == date(2016, 2, 5)
        assert result["kvnr"] is None

    def test_bad_date(self):
        """Invalid date like 29.08.0196 -- geburtsdatum should be None."""
        result = parse_hauptkontakt("Krölls | Peter | 29.08.0196 | S361390622")
        assert result["nachname"] == "Krölls"
        assert result["vorname"] == "Peter"
        assert result["geburtsdatum"] is None  # bad date => None
        assert result["kvnr"] == "S361390622"

    def test_missing_geburtsdatum(self):
        """Geburtsdatum field empty (spaces only)."""
        result = parse_hauptkontakt("Schaumann | Janina | |")
        assert result["nachname"] == "Schaumann"
        assert result["vorname"] == "Janina"
        assert result["geburtsdatum"] is None
        assert result["kvnr"] is None

    def test_missing_geburtsdatum_with_kvnr(self):
        """Geburtsdatum empty but KVNR present."""
        result = parse_hauptkontakt("Schuber | Fritz | | C352208902")
        assert result["nachname"] == "Schuber"
        assert result["vorname"] == "Fritz"
        assert result["geburtsdatum"] is None
        assert result["kvnr"] == "C352208902"

    def test_missing_vorname(self):
        """Vorname field empty."""
        result = parse_hauptkontakt("Wuffy | | |")
        assert result["nachname"] == "Wuffy"
        assert result["vorname"] is None
        assert result["geburtsdatum"] is None
        assert result["kvnr"] is None

    def test_whitespace_handling(self):
        """Extra whitespace around pipes is stripped."""
        result = parse_hauptkontakt(" Tonn | Regina | 28.04.1960 | D410126355 ")
        assert result["nachname"] == "Tonn"
        assert result["vorname"] == "Regina"
        assert result["geburtsdatum"] == date(1960, 4, 28)
        assert result["kvnr"] == "D410126355"

    def test_hyphenated_name(self):
        """Hyphenated names are preserved."""
        result = parse_hauptkontakt(
            "Hähle-Jakelski | Rüdiger | 14.10.1941 | X304698107"
        )
        assert result["nachname"] == "Hähle-Jakelski"
        assert result["vorname"] == "Rüdiger"


# ── parse_csv synthetic tests ─────────────────────────────────────────


def _make_csv_bytes(*rows: str, bom: bool = True) -> bytes:
    """Helper: build CSV bytes from header + data rows."""
    header = '"Hauptkontakt","Name","Thema","Erstellungsdatum","Modul"'
    text = "\n".join([header, *rows])
    prefix = b"\xef\xbb\xbf" if bom else b""
    return prefix + text.encode("utf-8")


class TestParseCSVSynthetic:
    def test_basic_single_row(self):
        """Parse a minimal CSV with one valid row."""
        row = (
            '"Tonn | Regina | 28.04.1960 | D410126355",'
            '"103486",'
            '"Zweitmeinung für Regina Tonn",'
            '"02.02.26, 08:50",'
            '"Zweitmeinung Kardiologie"'
        )
        cases = parse_csv(_make_csv_bytes(row))
        assert len(cases) == 1
        c = cases[0]
        assert isinstance(c, ParsedCase)
        assert c.nachname == "Tonn"
        assert c.vorname == "Regina"
        assert c.geburtsdatum == date(1960, 4, 28)
        assert c.kvnr == "D410126355"
        assert c.thema == "Zweitmeinung für Regina Tonn"
        assert c.fallgruppe == "kardio"
        assert c.datum == date(2026, 2, 2)
        assert c.jahr == 2026
        assert c.kw == 6
        assert c.crm_ticket_id == "103486"

    def test_multiple_fallgruppen(self):
        """Parse CSV with three different Fallgruppen."""
        rows = [
            (
                '"A | B | 01.01.1990 | X123456789",'
                '"100",'
                '"Thema 1",'
                '"02.02.26, 09:00",'
                '"Zweitmeinung Onkologie"'
            ),
            (
                '"C | D | 15.06.1985 | Y987654321",'
                '"101",'
                '"Thema 2",'
                '"03.02.26, 10:00",'
                '"Zweitmeinung Gallenblase"'
            ),
            (
                '"E | F | 20.12.1970 |",'
                '"102",'
                '"Thema 3",'
                '"04.02.26, 11:00",'
                '"Zweitmeinung Schilddrüse"'
            ),
        ]
        cases = parse_csv(_make_csv_bytes(*rows))
        assert len(cases) == 3
        assert cases[0].fallgruppe == "onko"
        assert cases[1].fallgruppe == "galle"
        assert cases[2].fallgruppe == "sd"
        assert cases[2].kvnr is None

    def test_empty_modul_skipped(self):
        """Rows with empty Modul field are skipped (spam/junk entries)."""
        rows = [
            (
                '"Tonn | Regina | 28.04.1960 | D410126355",'
                '"103486",'
                '"Thema",'
                '"02.02.26, 08:50",'
                '"Zweitmeinung Onkologie"'
            ),
            (
                '"Wuffy | | |",'
                '"103767",'
                '"Spam",'
                '"17.02.26, 17:16",'
                '""'
            ),
        ]
        cases = parse_csv(_make_csv_bytes(*rows))
        assert len(cases) == 1
        assert cases[0].nachname == "Tonn"

    def test_short_row_skipped(self):
        """A row with missing trailing columns is skipped without crashing."""
        rows = [
            '"A | B | 01.01.1990 |","100"',
            (
                '"C | D | 15.06.1985 | Y987654321",'
                '"101",'
                '"Thema",'
                '"03.02.26, 10:00",'
                '"Zweitmeinung Onkologie"'
            ),
        ]
        cases = parse_csv(_make_csv_bytes(*rows))
        assert len(cases) == 1
        assert cases[0].nachname == "C"

    def test_bad_geburtsdatum_still_parses(self):
        """Row with bad Geburtsdatum year is parsed, geburtsdatum=None."""
        row = (
            '"Krölls | Peter | 29.08.0196 | S361390622",'
            '"103514",'
            '"Zweitmeinung für Peter Krölls",'
            '"04.02.26, 11:06",'
            '"Zweitmeinung Onkologie"'
        )
        cases = parse_csv(_make_csv_bytes(row))
        assert len(cases) == 1
        assert cases[0].nachname == "Krölls"
        assert cases[0].geburtsdatum is None
        assert cases[0].kvnr == "S361390622"
        assert cases[0].fallgruppe == "onko"

    def test_without_bom(self):
        """CSV without BOM is also parseable."""
        row = (
            '"A | B | 01.01.1990 | X123456789",'
            '"100",'
            '"Thema",'
            '"02.02.26, 09:00",'
            '"Zweitmeinung Onkologie"'
        )
        cases = parse_csv(_make_csv_bytes(row, bom=False))
        assert len(cases) == 1

    def test_empty_csv_returns_empty(self):
        """CSV with only header returns empty list."""
        cases = parse_csv(_make_csv_bytes())
        assert cases == []

    def test_crm_ticket_id_from_name_column(self):
        """The 'Name' column contains the CRM ticket ID."""
        row = (
            '"A | B | 01.01.1990 |",'
            '"999888",'
            '"Thema",'
            '"10.02.26, 10:00",'
            '"Zweitmeinung Intensiv"'
        )
        cases = parse_csv(_make_csv_bytes(row))
        assert cases[0].crm_ticket_id == "999888"
        assert cases[0].fallgruppe == "intensiv"


# ── Real CSV file tests ────────────────────────────────────────────────


class TestParseRealCSV:
    @_requires_data_file("2026-02-06-0406.csv")
    def test_parse_real_csv_feb06(self):
        """Parse the 2026-02-06-0406.csv real export."""
        content = (DATA_DIR / "2026-02-06-0406.csv").read_bytes()
        cases = parse_csv(content, filename="2026-02-06-0406.csv")

        # 16 data rows, none with an empty Modul
        assert len(cases) == 16

        # Every case must have a valid fallgruppe
        for c in cases:
            assert c.fallgruppe in VALID_FALLGRUPPEN, (
                f"Invalid fallgruppe '{c.fallgruppe}' for {c.nachname}"
            )
            assert c.nachname  # nachname is always present
            assert c.datum is not None
            assert c.jahr >= 2026
            assert 1 <= c.kw <= 53
            assert c.crm_ticket_id is not None

        # Spot-check first row
        first = cases[0]
        assert first.nachname == "Tonn"
        assert first.vorname == "Regina"
        assert first.kvnr == "D410126355"
        assert first.fallgruppe == "kardio"
        assert first.datum == date(2026, 2, 2)
        assert first.kw == 6

        # Spot-check edge case: missing KVNR
        daum = next(c for c in cases if c.nachname == "Daum")
        assert daum.kvnr is None
        assert daum.fallgruppe == "intensiv"

        # Spot-check edge case: bad Geburtsdatum
        krolls = next(c for c in cases if c.nachname == "Krölls")
        assert krolls.geburtsdatum is None
        assert krolls.kvnr == "S361390622"

        # Spot-check edge case: missing Geburtsdatum
        schaumann = next(c for c in cases if c.nachname == "Schaumann")
        assert schaumann.geburtsdatum is None
        assert schaumann.kvnr is None

    @_requires_data_file("2026-02-17-1041.csv")
    def test_parse_real_csv_with_spam_rows(self):
        """Parse file that contains rows with empty Modul (spam entries)."""
        content = (DATA_DIR / "2026-02-17-1041.csv").read_bytes()
        cases = parse_csv(content, filename="2026-02-17-1041.csv")

        # File has 6 data rows, 2 with empty Modul => 4 valid cases
        assert len(cases) == 4

        # Spam entries (Wuffy, Apotheke) should NOT appear
        names = [c.nachname for c in cases]
        assert "Wuffy" not in names
        assert "Apotheke" not in names

        # All valid cases have a proper fallgruppe
        for c in cases:
            assert c.fallgruppe in VALID_FALLGRUPPEN

    @_requires_data_file("2026-02-17-0553.csv")
    def test_parse_real_csv_feb17(self):
        """Parse the 2026-02-17-0553.csv real export."""
        content = (DATA_DIR / "2026-02-17-0553.csv").read_bytes()
        cases = parse_csv(content, filename="2026-02-17-0553.csv")

        assert len(cases) > 0

        # Spot-check: Fritz Schuber has empty Geburtsdatum but has KVNR
        schuber = [c for c in cases if c.nachname == "Schuber"]
        if schuber:
            assert schuber[0].geburtsdatum is None
            assert schuber[0].kvnr == "C352208902"

    @_requires_data_file("2026-02-23-0902.csv")
    def test_parse_real_csv_feb23(self):
        """Parse the 2026-02-23-0902.csv real export."""
        content = (DATA_DIR / "2026-02-23-0902.csv").read_bytes()
        cases = parse_csv(content, filename="2026-02-23-0902.csv")

        assert len(cases) > 0
        for c in cases:
            assert c.fallgruppe in VALID_FALLGRUPPEN
            assert c.nachname