dak.c2s/backend/tests/test_csv_parser.py
CCS Admin 84d11822e0 feat: CRM CSV parser with pipe-delimited contact parsing
Parse CRM CSV exports (UTF-8-BOM, comma-delimited) with:
- Pipe-delimited Hauptkontakt field (Nachname|Vorname|Geburtsdatum|KVNR)
- German date and time formats (DD.MM.YYYY, DD.MM.YY, HH:MM)
- Modul-to-Fallgruppe mapping
- Graceful handling of missing KVNR, bad dates, empty fields, spam rows
- 19 tests (synthetic + all 4 real CSV files)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 07:45:34 +00:00

321 lines
12 KiB
Python

"""Tests for CRM CSV parser: synthetic and real-file tests."""
import pathlib
from datetime import date
import pytest
from app.services.csv_parser import ParsedCase, parse_csv, parse_hauptkontakt
from app.utils.fallgruppe_map import VALID_FALLGRUPPEN
# Directory that holds the real CRM CSV exports read by TestParseRealCSV.
# Each real-file test is skipped when its CSV is not present in this directory.
# NOTE(review): absolute, machine-specific path -- confirm it matches the
# environment the real-file tests are meant to run in.
DATA_DIR = pathlib.Path("/home/frontend/dak_c2s/data")
# ── parse_hauptkontakt tests ─────────────────────────────────────────
class TestParseHauptkontakt:
    """Unit tests for ``parse_hauptkontakt`` on pipe-delimited contact strings.

    Every test passes one raw ``Nachname | Vorname | Geburtsdatum | KVNR``
    string to the parser and checks the values stored under the
    ``nachname``, ``vorname``, ``geburtsdatum`` and ``kvnr`` keys.
    """

    def test_full_contact(self):
        """A contact with all four fields populated is returned in full."""
        raw = "Tonn | Regina | 28.04.1960 | D410126355"
        contact = parse_hauptkontakt(raw)
        assert contact["nachname"] == "Tonn"
        assert contact["vorname"] == "Regina"
        assert contact["geburtsdatum"] == date(1960, 4, 28)
        assert contact["kvnr"] == "D410126355"

    def test_missing_kvnr(self):
        """A trailing pipe with nothing after it yields ``kvnr`` of None."""
        raw = "Daum | Luana | 05.02.2016 |"
        contact = parse_hauptkontakt(raw)
        assert contact["nachname"] == "Daum"
        assert contact["vorname"] == "Luana"
        assert contact["geburtsdatum"] == date(2016, 2, 5)
        assert contact["kvnr"] is None

    def test_bad_date(self):
        """An implausible date such as 29.08.0196 yields ``geburtsdatum`` of None."""
        raw = "Krölls | Peter | 29.08.0196 | S361390622"
        contact = parse_hauptkontakt(raw)
        assert contact["nachname"] == "Krölls"
        assert contact["vorname"] == "Peter"
        # The remaining fields must still be parsed despite the bad date.
        assert contact["geburtsdatum"] is None
        assert contact["kvnr"] == "S361390622"

    def test_missing_geburtsdatum(self):
        """Blank Geburtsdatum and blank KVNR both come back as None."""
        raw = "Schaumann | Janina | |"
        contact = parse_hauptkontakt(raw)
        assert contact["nachname"] == "Schaumann"
        assert contact["vorname"] == "Janina"
        assert contact["geburtsdatum"] is None
        assert contact["kvnr"] is None

    def test_missing_geburtsdatum_with_kvnr(self):
        """A blank Geburtsdatum does not prevent the KVNR from being read."""
        raw = "Schuber | Fritz | | C352208902"
        contact = parse_hauptkontakt(raw)
        assert contact["nachname"] == "Schuber"
        assert contact["vorname"] == "Fritz"
        assert contact["geburtsdatum"] is None
        assert contact["kvnr"] == "C352208902"

    def test_missing_vorname(self):
        """Only the Nachname is given; all other fields are None."""
        raw = "Wuffy | | |"
        contact = parse_hauptkontakt(raw)
        assert contact["nachname"] == "Wuffy"
        assert contact["vorname"] is None
        assert contact["geburtsdatum"] is None
        assert contact["kvnr"] is None

    def test_whitespace_handling(self):
        """Leading and trailing whitespace around the fields is removed."""
        raw = " Tonn | Regina | 28.04.1960 | D410126355 "
        contact = parse_hauptkontakt(raw)
        assert contact["nachname"] == "Tonn"
        assert contact["vorname"] == "Regina"
        assert contact["kvnr"] == "D410126355"

    def test_hyphenated_name(self):
        """Hyphens and umlauts inside names survive parsing unchanged."""
        raw = "Hähle-Jakelski | Rüdiger | 14.10.1941 | X304698107"
        contact = parse_hauptkontakt(raw)
        assert contact["nachname"] == "Hähle-Jakelski"
        assert contact["vorname"] == "Rüdiger"
# ── parse_csv synthetic tests ─────────────────────────────────────────
def _make_csv_bytes(*rows: str, bom: bool = True) -> bytes:
    """Assemble an in-memory CSV payload for the synthetic tests.

    The fixed five-column header line comes first, followed by each entry
    of *rows* verbatim; lines are joined with a single newline and no
    trailing newline is added. The text is UTF-8 encoded and, when *bom*
    is true, prefixed with the UTF-8 byte-order mark.
    """
    all_lines = ('"Hauptkontakt","Name","Thema","Erstellungsdatum","Modul"', *rows)
    payload = "\n".join(all_lines).encode("utf-8")
    if not bom:
        return payload
    return b"\xef\xbb\xbf" + payload
class TestParseCSVSynthetic:
    """Tests for ``parse_csv`` driven by hand-written CSV rows.

    Rows are turned into a byte payload by ``_make_csv_bytes`` and the
    resulting list of parsed cases is inspected.
    """

    def test_basic_single_row(self):
        """One valid row produces one case with every attribute populated."""
        row = (
            '"Tonn | Regina | 28.04.1960 | D410126355",'
            '"103486",'
            '"Zweitmeinung für Regina Tonn",'
            '"02.02.26, 08:50",'
            '"Zweitmeinung Kardiologie"'
        )
        payload = _make_csv_bytes(row)
        parsed = parse_csv(payload)
        assert len(parsed) == 1
        case = parsed[0]
        # Fields taken from the Hauptkontakt column.
        assert case.nachname == "Tonn"
        assert case.vorname == "Regina"
        assert case.geburtsdatum == date(1960, 4, 28)
        assert case.kvnr == "D410126355"
        # Fields taken from the remaining columns.
        assert case.thema == "Zweitmeinung für Regina Tonn"
        assert case.fallgruppe == "kardio"
        assert case.datum == date(2026, 2, 2)
        assert case.jahr == 2026
        assert case.kw == 6
        assert case.crm_ticket_id == "103486"

    def test_multiple_fallgruppen(self):
        """Three rows with different Modul values map to three Fallgruppen."""
        onko_row = (
            '"A | B | 01.01.1990 | X123456789",'
            '"100",'
            '"Thema 1",'
            '"02.02.26, 09:00",'
            '"Zweitmeinung Onkologie"'
        )
        galle_row = (
            '"C | D | 15.06.1985 | Y987654321",'
            '"101",'
            '"Thema 2",'
            '"03.02.26, 10:00",'
            '"Zweitmeinung Gallenblase"'
        )
        sd_row = (
            '"E | F | 20.12.1970 |",'
            '"102",'
            '"Thema 3",'
            '"04.02.26, 11:00",'
            '"Zweitmeinung Schilddrüse"'
        )
        payload = _make_csv_bytes(onko_row, galle_row, sd_row)
        parsed = parse_csv(payload)
        assert len(parsed) == 3
        first, second, third = parsed
        assert first.fallgruppe == "onko"
        assert second.fallgruppe == "galle"
        assert third.fallgruppe == "sd"
        # The third row ends its Hauptkontakt with an empty KVNR field.
        assert third.kvnr is None

    def test_empty_modul_skipped(self):
        """A row whose Modul column is empty is left out of the result."""
        valid_row = (
            '"Tonn | Regina | 28.04.1960 | D410126355",'
            '"103486",'
            '"Thema",'
            '"02.02.26, 08:50",'
            '"Zweitmeinung Onkologie"'
        )
        spam_row = (
            '"Wuffy | | |",'
            '"103767",'
            '"Spam",'
            '"17.02.26, 17:16",'
            '""'
        )
        payload = _make_csv_bytes(valid_row, spam_row)
        parsed = parse_csv(payload)
        assert len(parsed) == 1
        assert parsed[0].nachname == "Tonn"

    def test_bad_geburtsdatum_still_parses(self):
        """A bad birth-date year keeps the row but sets geburtsdatum to None."""
        row = (
            '"Krölls | Peter | 29.08.0196 | S361390622",'
            '"103514",'
            '"Zweitmeinung für Peter Krölls",'
            '"04.02.26, 11:06",'
            '"Zweitmeinung Onkologie"'
        )
        payload = _make_csv_bytes(row)
        parsed = parse_csv(payload)
        assert len(parsed) == 1
        case = parsed[0]
        assert case.nachname == "Krölls"
        assert case.geburtsdatum is None
        assert case.kvnr == "S361390622"
        assert case.fallgruppe == "onko"

    def test_without_bom(self):
        """A payload without the UTF-8 byte-order mark is parsed as well."""
        row = (
            '"A | B | 01.01.1990 | X123456789",'
            '"100",'
            '"Thema",'
            '"02.02.26, 09:00",'
            '"Zweitmeinung Onkologie"'
        )
        payload = _make_csv_bytes(row, bom=False)
        parsed = parse_csv(payload)
        assert len(parsed) == 1

    def test_empty_csv_returns_empty(self):
        """A header-only payload produces an empty list."""
        header_only = _make_csv_bytes()
        parsed = parse_csv(header_only)
        assert parsed == []

    def test_crm_ticket_id_from_name_column(self):
        """The value of the 'Name' column ends up in ``crm_ticket_id``."""
        row = (
            '"A | B | 01.01.1990 |",'
            '"999888",'
            '"Thema",'
            '"10.02.26, 10:00",'
            '"Zweitmeinung Intensiv"'
        )
        payload = _make_csv_bytes(row)
        parsed = parse_csv(payload)
        case = parsed[0]
        assert case.crm_ticket_id == "999888"
        assert case.fallgruppe == "intensiv"
# ── Real CSV file tests ────────────────────────────────────────────────
class TestParseRealCSV:
    """Tests that run ``parse_csv`` against real CRM exports in ``DATA_DIR``.

    Each test is guarded by ``pytest.mark.skipif`` and is skipped when its
    CSV file does not exist.
    """

    @pytest.mark.skipif(
        not (DATA_DIR / "2026-02-06-0406.csv").exists(),
        reason="Real CSV file not available",
    )
    def test_parse_real_csv_feb06(self):
        """The 2026-02-06-0406.csv export yields 16 well-formed cases."""
        filename = "2026-02-06-0406.csv"
        raw = (DATA_DIR / filename).read_bytes()
        parsed = parse_csv(raw, filename=filename)

        # The export is expected to produce exactly 16 cases.
        assert len(parsed) > 0
        assert len(parsed) == 16

        # Invariants that must hold for every parsed case.
        for case in parsed:
            assert case.fallgruppe in VALID_FALLGRUPPEN, (
                f"Invalid fallgruppe '{case.fallgruppe}' for {case.nachname}"
            )
            assert case.nachname  # nachname is always present
            assert case.datum is not None
            assert case.jahr >= 2026
            assert 1 <= case.kw <= 53
            assert case.crm_ticket_id is not None

        # Spot-check: the first case.
        head = parsed[0]
        assert head.nachname == "Tonn"
        assert head.vorname == "Regina"
        assert head.kvnr == "D410126355"
        assert head.fallgruppe == "kardio"
        assert head.datum == date(2026, 2, 2)
        assert head.kw == 6

        # Spot-check: a contact without KVNR.
        daum_matches = [case for case in parsed if case.nachname == "Daum"]
        daum = daum_matches[0]
        assert daum.kvnr is None
        assert daum.fallgruppe == "intensiv"

        # Spot-check: a contact with an unparseable Geburtsdatum.
        krolls_matches = [case for case in parsed if case.nachname == "Krölls"]
        krolls = krolls_matches[0]
        assert krolls.geburtsdatum is None
        assert krolls.kvnr == "S361390622"

        # Spot-check: a contact with neither Geburtsdatum nor KVNR.
        schaumann_matches = [
            case for case in parsed if case.nachname == "Schaumann"
        ]
        schaumann = schaumann_matches[0]
        assert schaumann.geburtsdatum is None
        assert schaumann.kvnr is None

    @pytest.mark.skipif(
        not (DATA_DIR / "2026-02-17-1041.csv").exists(),
        reason="Real CSV file not available",
    )
    def test_parse_real_csv_with_spam_rows(self):
        """Spam rows (empty Modul) in 2026-02-17-1041.csv are dropped."""
        filename = "2026-02-17-1041.csv"
        raw = (DATA_DIR / filename).read_bytes()
        parsed = parse_csv(raw, filename=filename)

        # Four cases are expected to remain after the spam rows are skipped.
        assert len(parsed) == 4

        # Neither of the spam contacts may show up among the parsed names.
        nachnamen = [case.nachname for case in parsed]
        assert "Wuffy" not in nachnamen
        assert "Apotheke" not in nachnamen

        # Every remaining case carries a known fallgruppe.
        for case in parsed:
            assert case.fallgruppe in VALID_FALLGRUPPEN

    @pytest.mark.skipif(
        not (DATA_DIR / "2026-02-17-0553.csv").exists(),
        reason="Real CSV file not available",
    )
    def test_parse_real_csv_feb17(self):
        """The 2026-02-17-0553.csv export yields at least one case."""
        filename = "2026-02-17-0553.csv"
        raw = (DATA_DIR / filename).read_bytes()
        parsed = parse_csv(raw, filename=filename)
        assert len(parsed) > 0

        # Spot-check: Schuber has no Geburtsdatum but does have a KVNR.
        # NOTE(review): the check is skipped silently when no "Schuber" case
        # is present -- confirm that this leniency is intended.
        schuber_matches = [case for case in parsed if case.nachname == "Schuber"]
        if schuber_matches:
            schuber = schuber_matches[0]
            assert schuber.geburtsdatum is None
            assert schuber.kvnr == "C352208902"

    @pytest.mark.skipif(
        not (DATA_DIR / "2026-02-23-0902.csv").exists(),
        reason="Real CSV file not available",
    )
    def test_parse_real_csv_feb23(self):
        """Every case from 2026-02-23-0902.csv has a fallgruppe and nachname."""
        filename = "2026-02-23-0902.csv"
        raw = (DATA_DIR / filename).read_bytes()
        parsed = parse_csv(raw, filename=filename)
        assert len(parsed) > 0
        for case in parsed:
            assert case.fallgruppe in VALID_FALLGRUPPEN
            assert case.nachname