mirror of
https://github.com/complexcaresolutions/dak.c2s.git
synced 2026-03-17 23:03:41 +00:00
Parse CRM CSV exports (UTF-8-BOM, comma-delimited) with: - Pipe-delimited Hauptkontakt field (Nachname|Vorname|Geburtsdatum|KVNR) - German date formats (DD.MM.YYYY, DD.MM.YY, HH:MM) - Modul-to-Fallgruppe mapping - Graceful handling of missing KVNR, bad dates, empty fields, spam rows - 19 tests (synthetic + all 4 real CSV files) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
143 lines
4.5 KiB
Python
143 lines
4.5 KiB
Python
"""CRM CSV parser for DAK Zweitmeinungs-Portal.
|
|
|
|
Parses CRM CSV exports with:
|
|
- UTF-8-BOM encoding
|
|
- Comma-delimited columns: Hauptkontakt, Name, Thema, Erstellungsdatum, Modul
|
|
- Pipe-delimited Hauptkontakt: "Nachname | Vorname | Geburtsdatum | KVNR"
|
|
- German date formats: DD.MM.YYYY (Geburtsdatum), DD.MM.YY, HH:MM (Erstellungsdatum)
|
|
- Modul-to-Fallgruppe mapping
|
|
"""
|
|
|
|
import csv
|
|
import io
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from datetime import date
|
|
from typing import Optional
|
|
|
|
from app.utils.fallgruppe_map import map_modul_to_fallgruppe
|
|
from app.utils.kw_utils import date_to_jahr, date_to_kw, parse_german_date
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ParsedCase:
    """One case record built from a single row of a CRM CSV export.

    The contact fields come from the pipe-delimited ``Hauptkontakt`` column;
    the remaining fields come from the other CSV columns or are derived
    from the creation date.
    """

    # --- contact data (from the Hauptkontakt column) ---
    nachname: str  # family name; may be an empty string
    vorname: Optional[str]  # given name, None when missing
    geburtsdatum: Optional[date]  # date of birth, None when missing/unparseable
    kvnr: Optional[str]  # KVNR field of the contact, None when missing

    # --- case data ---
    thema: str  # content of the "Thema" column
    fallgruppe: str  # result of mapping the "Modul" column
    datum: date  # creation date of the case
    jahr: int  # year derived from ``datum``
    kw: int  # calendar week derived from ``datum``
    crm_ticket_id: Optional[str]  # content of the "Name" column, None when empty
|
|
|
|
|
|
def parse_hauptkontakt(raw: str) -> dict:
    """Parse the pipe-delimited contact string of a CRM row.

    Format: "Nachname | Vorname | Geburtsdatum | KVNR"

    Each part is stripped of surrounding whitespace. Parts beyond the
    fourth are ignored; missing parts are treated as empty.

    Edge cases handled:
    - Missing KVNR: "Daum | Luana | 05.02.2016 |"
    - Missing Geburtsdatum: "Schaumann | Janina | |"
    - Bad date: "Krölls | Peter | 29.08.0196 | S361390622"
    - Missing Vorname: "Wuffy | | |"
    - Empty or ``None`` input (``csv.DictReader`` yields ``None`` for
      columns missing from a short row)

    Args:
        raw: The raw contact field. ``None`` is tolerated and treated like
            an empty string.

    Returns:
        A dict with the keys ``nachname`` (str, possibly empty),
        ``vorname`` (str or None), ``geburtsdatum`` (whatever
        ``parse_german_date`` returns, or None when the part is empty or
        could not be parsed) and ``kvnr`` (str or None).
    """
    parts = [p.strip() for p in (raw or "").split("|")]
    # Pad to four fields so each one can be unpacked unconditionally.
    parts += [""] * (4 - len(parts))
    nachname, vorname, geburtsdatum_raw, kvnr = parts[:4]

    result = {
        "nachname": nachname,
        "vorname": vorname or None,
        "geburtsdatum": None,
        "kvnr": kvnr or None,
    }

    if geburtsdatum_raw:
        try:
            result["geburtsdatum"] = parse_german_date(geburtsdatum_raw)
        except Exception as e:
            # Best effort: a bad birth date must not discard the whole
            # contact, so any parser failure is logged and the date left
            # as None.
            logger.warning(
                "Could not parse Geburtsdatum '%s': %s", geburtsdatum_raw, e
            )
    return result
|
|
|
|
|
|
def _cell(row: dict, column: str) -> str:
    """Return the stripped text of *column* in *row*, or "" when absent.

    ``csv.DictReader`` fills columns that are missing from a short row with
    ``None`` (its default ``restval``), so ``row.get(column, "")`` alone
    does not guarantee a string.
    """
    return (row.get(column) or "").strip()


def parse_csv(content: bytes, filename: str = "") -> list[ParsedCase]:
    """Parse CRM CSV file content into a list of ParsedCase objects.

    Args:
        content: Raw bytes of the CSV file (UTF-8, with or without BOM).
        filename: Optional filename, used only for logging context.

    Returns:
        List of successfully parsed cases. Rows with an empty Modul are
        skipped (logged at debug level), rows with an unmappable Modul are
        skipped (logged as warnings). Rows that fail for any other reason
        are dropped and logged as warnings. Rows with fewer columns than
        the header are handled like rows with empty fields.

    Raises:
        UnicodeDecodeError: If *content* is not valid UTF-8; decoding
            happens before the per-row error handling.
    """
    text = content.decode("utf-8-sig")  # "utf-8-sig" drops a leading BOM
    reader = csv.DictReader(io.StringIO(text))
    cases: list[ParsedCase] = []
    errors: list[str] = []
    skipped = 0

    for i, row in enumerate(reader, start=2):  # row 1 is header
        try:
            # Parse pipe-delimited contact field (None for short rows)
            kontakt = parse_hauptkontakt(row.get("Hauptkontakt") or "")

            # Parse creation date; fall back to today when the cell is empty
            datum_str = _cell(row, "Erstellungsdatum")
            datum = parse_german_date(datum_str) if datum_str else date.today()

            # Map Modul to Fallgruppe -- skip rows with empty/unknown Modul
            modul = _cell(row, "Modul")
            if not modul:
                skipped += 1
                logger.debug(
                    "Skipping row %d in %s: empty Modul field", i, filename
                )
                continue

            try:
                fallgruppe = map_modul_to_fallgruppe(modul)
            except ValueError:
                skipped += 1
                logger.warning(
                    "Skipping row %d in %s: unmappable Modul '%s'",
                    i,
                    filename,
                    modul,
                )
                continue

            cases.append(
                ParsedCase(
                    nachname=kontakt["nachname"],
                    vorname=kontakt["vorname"],
                    geburtsdatum=kontakt["geburtsdatum"],
                    kvnr=kontakt["kvnr"],
                    thema=_cell(row, "Thema"),
                    fallgruppe=fallgruppe,
                    datum=datum,
                    jahr=date_to_jahr(datum),
                    kw=date_to_kw(datum),
                    crm_ticket_id=_cell(row, "Name") or None,
                )
            )
        except Exception as e:
            # Per-row boundary: one bad row must not abort the whole import.
            errors.append(f"Row {i}: {e}")
            logger.warning("CSV parse error in %s row %d: %s", filename, i, e)

    logger.info(
        "CSV parsing of '%s' complete: %d parsed, %d skipped, %d errors",
        filename,
        len(cases),
        skipped,
        len(errors),
    )

    return cases
|