# dak.c2s/backend/tests/test_csv_parser.py

"""Tests for CRM CSV parser: synthetic and real-file tests."""

import pathlib
from datetime import date

import pytest

from app.services.csv_parser import ParsedCase, parse_csv, parse_hauptkontakt
from app.utils.fallgruppe_map import VALID_FALLGRUPPEN

# Directory the real-file tests read their CRM CSV exports from.
# NOTE(review): this is an absolute, machine-specific path; every test that
# reads from it is guarded by a ``pytest.mark.skipif`` on the file's existence.
DATA_DIR = pathlib.Path("/home/frontend/dak_c2s/data")


# ── parse_hauptkontakt tests ─────────────────────────────────────────


class TestParseHauptkontakt:
    """Checks for ``parse_hauptkontakt`` on pipe-separated contact strings."""

    @staticmethod
    def _expect(contact, expected):
        """Assert every field in *expected*, in order.

        A ``None`` expectation is verified by identity, anything else by
        equality.
        """
        for key, wanted in expected.items():
            if wanted is None:
                assert contact[key] is None
            else:
                assert contact[key] == wanted

    def test_full_contact(self):
        """Every one of the four fields is supplied and parseable."""
        contact = parse_hauptkontakt("Tonn | Regina | 28.04.1960 | D410126355")
        self._expect(
            contact,
            {
                "nachname": "Tonn",
                "vorname": "Regina",
                "geburtsdatum": date(1960, 4, 28),
                "kvnr": "D410126355",
            },
        )

    def test_missing_kvnr(self):
        """Nothing follows the last pipe, so KVNR is reported as None."""
        contact = parse_hauptkontakt("Daum | Luana | 05.02.2016 |")
        self._expect(
            contact,
            {
                "nachname": "Daum",
                "vorname": "Luana",
                "geburtsdatum": date(2016, 2, 5),
                "kvnr": None,
            },
        )

    def test_bad_date(self):
        """An implausible date such as 29.08.0196 yields geburtsdatum None."""
        contact = parse_hauptkontakt("Krölls | Peter | 29.08.0196 | S361390622")
        self._expect(
            contact,
            {
                "nachname": "Krölls",
                "vorname": "Peter",
                "geburtsdatum": None,  # bad date => None
                "kvnr": "S361390622",
            },
        )

    def test_missing_geburtsdatum(self):
        """A whitespace-only Geburtsdatum field (and no KVNR) gives None twice."""
        contact = parse_hauptkontakt("Schaumann | Janina |  |")
        self._expect(
            contact,
            {
                "nachname": "Schaumann",
                "vorname": "Janina",
                "geburtsdatum": None,
                "kvnr": None,
            },
        )

    def test_missing_geburtsdatum_with_kvnr(self):
        """Blank Geburtsdatum does not prevent the KVNR from being read."""
        contact = parse_hauptkontakt("Schuber | Fritz |  | C352208902")
        self._expect(
            contact,
            {
                "nachname": "Schuber",
                "vorname": "Fritz",
                "geburtsdatum": None,
                "kvnr": "C352208902",
            },
        )

    def test_missing_vorname(self):
        """Only the Nachname is filled in; the other fields come back None."""
        contact = parse_hauptkontakt("Wuffy |  |  |")
        self._expect(
            contact,
            {
                "nachname": "Wuffy",
                "vorname": None,
                "geburtsdatum": None,
                "kvnr": None,
            },
        )

    def test_whitespace_handling(self):
        """Padding around the pipe-delimited values is removed."""
        contact = parse_hauptkontakt(
            "  Tonn  |  Regina  | 28.04.1960 | D410126355 "
        )
        self._expect(
            contact,
            {
                "nachname": "Tonn",
                "vorname": "Regina",
                "kvnr": "D410126355",
            },
        )

    def test_hyphenated_name(self):
        """A hyphen inside the Nachname survives parsing unchanged."""
        contact = parse_hauptkontakt(
            "Hähle-Jakelski | Rüdiger | 14.10.1941 | X304698107"
        )
        self._expect(
            contact,
            {
                "nachname": "Hähle-Jakelski",
                "vorname": "Rüdiger",
            },
        )


# ── parse_csv synthetic tests ─────────────────────────────────────────


def _make_csv_bytes(*rows: str, bom: bool = True) -> bytes:
    """Helper: build CSV bytes from header + data rows."""
    header = '"Hauptkontakt","Name","Thema","Erstellungsdatum","Modul"'
    lines = [header] + list(rows)
    text = "\n".join(lines)
    prefix = b"\xef\xbb\xbf" if bom else b""
    return prefix + text.encode("utf-8")


class TestParseCSVSynthetic:
    """``parse_csv`` exercised with small hand-written CSV payloads."""

    @staticmethod
    def _parse(*rows, bom=True):
        """Encode *rows* via ``_make_csv_bytes`` and feed them to ``parse_csv``."""
        return parse_csv(_make_csv_bytes(*rows, bom=bom))

    def test_basic_single_row(self):
        """One well-formed row produces one fully populated case."""
        parsed = self._parse(
            '"Tonn | Regina | 28.04.1960 | D410126355",'
            '"103486",'
            '"Zweitmeinung für Regina Tonn",'
            '"02.02.26, 08:50",'
            '"Zweitmeinung Kardiologie"'
        )
        assert len(parsed) == 1
        case = parsed[0]
        # Contact fields taken from the Hauptkontakt column.
        assert case.nachname == "Tonn"
        assert case.vorname == "Regina"
        assert case.geburtsdatum == date(1960, 4, 28)
        assert case.kvnr == "D410126355"
        # Fields taken from the remaining columns.
        assert case.thema == "Zweitmeinung für Regina Tonn"
        assert case.fallgruppe == "kardio"
        assert case.datum == date(2026, 2, 2)
        assert case.jahr == 2026
        assert case.kw == 6
        assert case.crm_ticket_id == "103486"

    def test_multiple_fallgruppen(self):
        """Three rows with distinct Modul values map to three Fallgruppen."""
        onko_row = (
            '"A | B | 01.01.1990 | X123456789",'
            '"100",'
            '"Thema 1",'
            '"02.02.26, 09:00",'
            '"Zweitmeinung Onkologie"'
        )
        galle_row = (
            '"C | D | 15.06.1985 | Y987654321",'
            '"101",'
            '"Thema 2",'
            '"03.02.26, 10:00",'
            '"Zweitmeinung Gallenblase"'
        )
        sd_row = (
            '"E | F | 20.12.1970 |",'
            '"102",'
            '"Thema 3",'
            '"04.02.26, 11:00",'
            '"Zweitmeinung Schilddrüse"'
        )
        parsed = self._parse(onko_row, galle_row, sd_row)
        assert len(parsed) == 3
        for case, fallgruppe in zip(parsed, ("onko", "galle", "sd")):
            assert case.fallgruppe == fallgruppe
        assert parsed[2].kvnr is None

    def test_empty_modul_skipped(self):
        """A row whose Modul is empty (spam/junk entry) is dropped."""
        valid_row = (
            '"Tonn | Regina | 28.04.1960 | D410126355",'
            '"103486",'
            '"Thema",'
            '"02.02.26, 08:50",'
            '"Zweitmeinung Onkologie"'
        )
        spam_row = (
            '"Wuffy |  |  |",'
            '"103767",'
            '"Spam",'
            '"17.02.26, 17:16",'
            '""'
        )
        parsed = self._parse(valid_row, spam_row)
        assert len(parsed) == 1
        assert parsed[0].nachname == "Tonn"

    def test_bad_geburtsdatum_still_parses(self):
        """A bad Geburtsdatum year keeps the row; only geburtsdatum is None."""
        parsed = self._parse(
            '"Krölls | Peter | 29.08.0196 | S361390622",'
            '"103514",'
            '"Zweitmeinung für Peter Krölls",'
            '"04.02.26, 11:06",'
            '"Zweitmeinung Onkologie"'
        )
        assert len(parsed) == 1
        case = parsed[0]
        assert case.nachname == "Krölls"
        assert case.geburtsdatum is None
        assert case.kvnr == "S361390622"
        assert case.fallgruppe == "onko"

    def test_without_bom(self):
        """A payload lacking the UTF-8 BOM still yields its row."""
        parsed = self._parse(
            '"A | B | 01.01.1990 | X123456789",'
            '"100",'
            '"Thema",'
            '"02.02.26, 09:00",'
            '"Zweitmeinung Onkologie"',
            bom=False,
        )
        assert len(parsed) == 1

    def test_empty_csv_returns_empty(self):
        """A header-only payload parses to an empty list."""
        assert self._parse() == []

    def test_crm_ticket_id_from_name_column(self):
        """The value of the 'Name' column ends up as crm_ticket_id."""
        parsed = self._parse(
            '"A | B | 01.01.1990 |",'
            '"999888",'
            '"Thema",'
            '"10.02.26, 10:00",'
            '"Zweitmeinung Intensiv"'
        )
        case = parsed[0]
        assert case.crm_ticket_id == "999888"
        assert case.fallgruppe == "intensiv"


# ── Real CSV file tests ────────────────────────────────────────────────


class TestParseRealCSV:
    """``parse_csv`` against real CRM exports; a test is skipped if its file is absent."""

    @pytest.mark.skipif(
        not (DATA_DIR / "2026-02-06-0406.csv").exists(),
        reason="Real CSV file not available",
    )
    def test_parse_real_csv_feb06(self):
        """Full check of the 2026-02-06-0406.csv export, including edge-case rows."""
        export = "2026-02-06-0406.csv"
        cases = parse_csv((DATA_DIR / export).read_bytes(), filename=export)

        def first_named(nachname):
            """Return the first parsed case carrying the given nachname."""
            return [case for case in cases if case.nachname == nachname][0]

        # 16 data rows are expected; none of them has an empty Modul.
        total = len(cases)
        assert total > 0
        assert total == 16

        # Invariants that have to hold for every single case.
        for case in cases:
            assert case.fallgruppe in VALID_FALLGRUPPEN, (
                f"Invalid fallgruppe '{case.fallgruppe}' for {case.nachname}"
            )
            assert case.nachname  # nachname is always present
            assert case.datum is not None
            assert case.jahr >= 2026
            assert 1 <= case.kw <= 53
            assert case.crm_ticket_id is not None

        # First row of the export.
        head = cases[0]
        assert head.nachname == "Tonn"
        assert head.vorname == "Regina"
        assert head.kvnr == "D410126355"
        assert head.fallgruppe == "kardio"
        assert head.datum == date(2026, 2, 2)
        assert head.kw == 6

        # Edge case: KVNR missing.
        daum = first_named("Daum")
        assert daum.kvnr is None
        assert daum.fallgruppe == "intensiv"

        # Edge case: unparseable Geburtsdatum.
        krolls = first_named("Krölls")
        assert krolls.geburtsdatum is None
        assert krolls.kvnr == "S361390622"

        # Edge case: Geburtsdatum missing altogether.
        schaumann = first_named("Schaumann")
        assert schaumann.geburtsdatum is None
        assert schaumann.kvnr is None

    @pytest.mark.skipif(
        not (DATA_DIR / "2026-02-17-1041.csv").exists(),
        reason="Real CSV file not available",
    )
    def test_parse_real_csv_with_spam_rows(self):
        """Rows with an empty Modul (spam entries) are absent from the result."""
        export = "2026-02-17-1041.csv"
        cases = parse_csv((DATA_DIR / export).read_bytes(), filename=export)

        # 6 data rows in the file, 2 of them with empty Modul, leaving 4.
        assert len(cases) == 4

        # Neither spam entry may show up among the parsed surnames.
        surnames = [case.nachname for case in cases]
        for spam_name in ("Wuffy", "Apotheke"):
            assert spam_name not in surnames

        # Whatever remains must carry a known fallgruppe.
        for case in cases:
            assert case.fallgruppe in VALID_FALLGRUPPEN

    @pytest.mark.skipif(
        not (DATA_DIR / "2026-02-17-0553.csv").exists(),
        reason="Real CSV file not available",
    )
    def test_parse_real_csv_feb17(self):
        """The 2026-02-17-0553.csv export parses to at least one case."""
        export = "2026-02-17-0553.csv"
        cases = parse_csv((DATA_DIR / export).read_bytes(), filename=export)

        assert len(cases) > 0

        # Fritz Schuber: Geburtsdatum is empty, KVNR is present. The check
        # only applies when such a row is actually in the parsed result.
        matches = [case for case in cases if case.nachname == "Schuber"]
        if not matches:
            return
        fritz = matches[0]
        assert fritz.geburtsdatum is None
        assert fritz.kvnr == "C352208902"

    @pytest.mark.skipif(
        not (DATA_DIR / "2026-02-23-0902.csv").exists(),
        reason="Real CSV file not available",
    )
    def test_parse_real_csv_feb23(self):
        """Every case of the 2026-02-23-0902.csv export has fallgruppe and nachname."""
        export = "2026-02-23-0902.csv"
        cases = parse_csv((DATA_DIR / export).read_bytes(), filename=export)

        assert len(cases) > 0
        for case in cases:
            assert case.fallgruppe in VALID_FALLGRUPPEN
            assert case.nachname