dak.c2s/backend/app/services/csv_parser.py
CCS Admin 84d11822e0 feat: CRM CSV parser with pipe-delimited contact parsing
Parse CRM CSV exports (UTF-8-BOM, comma-delimited) with:
- Pipe-delimited Hauptkontakt field (Nachname|Vorname|Geburtsdatum|KVNR)
- German date formats (DD.MM.YYYY, DD.MM.YY, HH:MM)
- Modul-to-Fallgruppe mapping
- Graceful handling of missing KVNR, bad dates, empty fields, spam rows
- 19 tests (synthetic + all 4 real CSV files)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 07:45:34 +00:00

143 lines
4.5 KiB
Python

"""CRM CSV parser for DAK Zweitmeinungs-Portal.
Parses CRM CSV exports with:
- UTF-8-BOM encoding
- Comma-delimited columns: Hauptkontakt, Name, Thema, Erstellungsdatum, Modul
- Pipe-delimited Hauptkontakt: "Nachname | Vorname | Geburtsdatum | KVNR"
- German date formats: DD.MM.YYYY (Geburtsdatum) and "DD.MM.YY, HH:MM" (Erstellungsdatum)
- Modul-to-Fallgruppe mapping
"""
import csv
import io
import logging
from dataclasses import dataclass
from datetime import date
from typing import Optional
from app.utils.fallgruppe_map import map_modul_to_fallgruppe
from app.utils.kw_utils import date_to_jahr, date_to_kw, parse_german_date
logger = logging.getLogger(__name__)
@dataclass
class ParsedCase:
    """One case extracted from a single row of a CRM CSV export.

    The field order below is also the positional order of the generated
    constructor, so it must not be rearranged.
    """

    # Contact details split out of the pipe-delimited Hauptkontakt column.
    # Everything except the surname may be missing in the export.
    nachname: str
    vorname: Optional[str]
    geburtsdatum: Optional[date]
    kvnr: Optional[str]

    # Free-text topic of the case and the Fallgruppe mapped from its Modul.
    thema: str
    fallgruppe: str

    # Case date, plus the values date_to_jahr / date_to_kw derive from it.
    datum: date
    jahr: int
    kw: int

    # Taken from the CSV "Name" column; None when that cell is empty.
    crm_ticket_id: Optional[str]
def parse_hauptkontakt(raw: Optional[str]) -> dict:
    """Parse the pipe-delimited Hauptkontakt string of a CRM row.

    Expected layout: "Nachname | Vorname | Geburtsdatum | KVNR". Every
    segment is whitespace-stripped; missing or blank segments are tolerated
    and segments after the fourth are ignored.

    Edge cases handled:
    - Missing KVNR: "Daum | Luana | 05.02.2016 |"
    - Missing Geburtsdatum: "Schaumann | Janina | |"
    - Bad date: "Krölls | Peter | 29.08.0196 | S361390622"
      (a warning is logged and "geburtsdatum" stays None)
    - Missing Vorname: "Wuffy | | |"
    - Empty string or None: every field is empty

    Args:
        raw: Content of the Hauptkontakt cell. None is treated like "".

    Returns:
        Dict with the keys "nachname" (str, "" when absent), "vorname" and
        "kvnr" (str, or None when absent/blank) and "geburtsdatum" (the
        result of parse_german_date, or None when absent or unparseable).
    """
    parts = [part.strip() for part in (raw or "").split("|")]
    # Pad to four segments so short inputs unpack without index checks.
    nachname, vorname, geburtsdatum_raw, kvnr = (parts + [""] * 4)[:4]

    geburtsdatum = None
    if geburtsdatum_raw:
        try:
            geburtsdatum = parse_german_date(geburtsdatum_raw)
        except Exception as exc:
            # Deliberately broad: a malformed birth date must never cost us
            # the whole row, whatever the date helper raises.
            logger.warning(
                "Could not parse Geburtsdatum '%s': %s", geburtsdatum_raw, exc
            )

    return {
        "nachname": nachname,
        "vorname": vorname or None,
        "geburtsdatum": geburtsdatum,
        "kvnr": kvnr or None,
    }
def _row_text(row: dict, column: str) -> str:
    """Return the stripped text of *column* in *row*, or "" if it has none.

    csv.DictReader fills columns that are missing from a short row with
    None, and dict.get only applies its default when the key is absent, so
    the value has to be type-checked before string methods are called on it.
    """
    value = row.get(column)
    return value.strip() if isinstance(value, str) else ""


def parse_csv(content: bytes, filename: str = "") -> list[ParsedCase]:
    """Parse CRM CSV file content into a list of ParsedCase objects.

    Args:
        content: Raw bytes of the CSV file (UTF-8, with or without BOM).
        filename: Optional filename, used only as logging context.

    Returns:
        List of successfully parsed cases, in file order. Rows with an empty
        Modul (logged at debug level) or an unmappable Modul (logged as a
        warning) are skipped. Rows that fail for any other reason are also
        left out and logged as warnings.

    Raises:
        UnicodeDecodeError: If *content* is not valid UTF-8.
    """
    text = content.decode("utf-8-sig")  # utf-8-sig drops a leading BOM
    reader = csv.DictReader(io.StringIO(text))

    cases: list[ParsedCase] = []
    skipped = 0
    error_count = 0

    for i, row in enumerate(reader, start=2):  # row 1 is the header
        try:
            # Decide first whether the row is wanted at all, so that rows
            # we skip anyway (e.g. spam without a Modul) neither trigger
            # date warnings nor get counted as parse errors.
            modul = _row_text(row, "Modul")
            if not modul:
                skipped += 1
                logger.debug(
                    "Skipping row %d in %s: empty Modul field", i, filename
                )
                continue
            try:
                fallgruppe = map_modul_to_fallgruppe(modul)
            except ValueError:
                skipped += 1
                logger.warning(
                    "Skipping row %d in %s: unmappable Modul '%s'",
                    i,
                    filename,
                    modul,
                )
                continue

            # Split the pipe-delimited contact field.
            kontakt = parse_hauptkontakt(_row_text(row, "Hauptkontakt"))

            # Creation date; rows without one are stamped with today's date.
            # NOTE(review): confirm that falling back to the import day is
            # the intended behaviour for a missing Erstellungsdatum.
            datum_str = _row_text(row, "Erstellungsdatum")
            datum = parse_german_date(datum_str) if datum_str else date.today()

            cases.append(
                ParsedCase(
                    nachname=kontakt["nachname"],
                    vorname=kontakt["vorname"],
                    geburtsdatum=kontakt["geburtsdatum"],
                    kvnr=kontakt["kvnr"],
                    thema=_row_text(row, "Thema"),
                    fallgruppe=fallgruppe,
                    datum=datum,
                    jahr=date_to_jahr(datum),
                    kw=date_to_kw(datum),
                    crm_ticket_id=_row_text(row, "Name") or None,
                )
            )
        except Exception as e:
            # Best-effort import: one broken row must not abort the file.
            error_count += 1
            logger.warning("CSV parse error in %s row %d: %s", filename, i, e)

    logger.info(
        "CSV parsing of '%s' complete: %d parsed, %d skipped, %d errors",
        filename,
        len(cases),
        skipped,
        error_count,
    )
    return cases