#!/usr/bin/env python3
"""Prepare hand-edited CV JSON fixtures for benchmark comparison.

Every ``*.json`` file in the source directory is parsed with a relaxed loader
(bare ``?`` and ``Null`` placeholders in value position are read as ``null``),
reduced to the known top-level keys, stripped of ``metadata``/``sections``
keys at every nesting level, given defaults for missing top-level sections,
and written to the output directory as indented JSON.
"""
import argparse
import json
import pathlib
import re
import sys
from typing import Any

# Lower-cased spelling of a qualification level -> canonical spelling.
QUALIFICATION_LEVEL_MAP = {
    "secondary": "Secondary",
    "diploma/certificate": "Diploma/Certificate",
    "diploma": "Diploma/Certificate",
    "certificate": "Diploma/Certificate",
    "bachelor": "Bachelor",
    "bachelor's": "Bachelor",
    "bachelors": "Bachelor",
    "master": "Master",
    "master's": "Master",
    "masters": "Master",
    "phd": "PhD",
    "doctorate": "PhD",
    "other": "Other",
}

# Only these top-level keys of a fixture survive normalization.
TOP_LEVEL_KEYS = {
    "version",
    "contact",
    "summary",
    "jobs",
    "education",
    "certifications",
    "projects",
    "skills",
    "languages",
    "interests",
    "otherSections",
}

# A bare `?` / `Null` directly after a colon and before `,`, `}` or `]`.
BARE_QUESTION_RE = re.compile(r'(:\s*)\?(\s*[,}\]])')
BARE_NULL_RE = re.compile(r'(:\s*)Null(\s*[,}\]])')

# A complete double-quoted JSON string literal, including escaped characters.
# The capturing group makes `re.split` keep the literals in its result.
JSON_STRING_RE = re.compile(r'("(?:\\.|[^"\\])*")')


def relax_json_text(raw: str) -> str:
    """Return *raw* with bare ``?`` and ``Null`` values replaced by ``null``.

    Only text outside double-quoted string literals is rewritten, so a string
    value that happens to contain ``: ?,`` or ``: Null,`` is left intact.
    """
    # With one capturing group, split() yields alternating segments:
    # even indexes are text between strings, odd indexes are the strings.
    pieces = JSON_STRING_RE.split(raw)
    pieces[::2] = [
        BARE_NULL_RE.sub(r'\1null\2', BARE_QUESTION_RE.sub(r'\1null\2', piece))
        for piece in pieces[::2]
    ]
    return "".join(pieces)


def load_relaxed_json(path: pathlib.Path) -> Any:
    """Read *path* and parse it as JSON, tolerating ``?``/``Null`` placeholders.

    Undecodable bytes are replaced rather than raising, and a leading UTF-8
    byte-order mark is skipped.

    Raises:
        json.JSONDecodeError: if the relaxed text is still not valid JSON.
    """
    # utf-8-sig decodes plain UTF-8 identically but drops a leading BOM, which
    # json.loads would otherwise reject.
    raw = path.read_text(encoding="utf-8-sig", errors="replace")
    return json.loads(relax_json_text(raw))


def normalize_qualification_level(value: Any) -> Any:
    """Map a qualification level string to its canonical spelling.

    The lookup ignores surrounding whitespace and letter case. ``None``,
    non-string values and unrecognised strings are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return QUALIFICATION_LEVEL_MAP.get(value.strip().lower(), value)


def normalize_value(value: Any) -> Any:
    """Return a copy of *value* without ``metadata``/``sections`` dict keys.

    Dicts and lists are rebuilt recursively; any other value is returned
    as is.
    """
    if isinstance(value, dict):
        return {
            key: normalize_value(item)
            for key, item in value.items()
            if key not in {"metadata", "sections"}
        }
    if isinstance(value, list):
        return [normalize_value(item) for item in value]
    return value


def normalize_fixture(payload: dict[str, Any]) -> dict[str, Any]:
    """Build the cleaned fixture for *payload*.

    Keeps only the keys in ``TOP_LEVEL_KEYS``, normalizes their values, fills
    in defaults for missing sections and canonicalises the
    ``qualificationLevel`` of every dict entry in ``education`` (the key is
    added with ``None`` when an entry lacks it).
    """
    cleaned = {
        key: normalize_value(value)
        for key, value in payload.items()
        if key in TOP_LEVEL_KEYS
    }

    # Defaults are appended in a fixed order so the output key order is stable.
    cleaned.setdefault("version", "1")
    cleaned.setdefault("contact", {})
    for key in (
        "summary",
        "jobs",
        "education",
        "certifications",
        "projects",
        "skills",
        "languages",
        "interests",
        "otherSections",
    ):
        cleaned.setdefault(key, [])

    # setdefault() does not replace an explicit null (e.g. from a relaxed
    # `"education": ?`), so only iterate when the value really is a list.
    education_entries = cleaned["education"]
    if isinstance(education_entries, list):
        for education in education_entries:
            if isinstance(education, dict):
                education["qualificationLevel"] = normalize_qualification_level(
                    education.get("qualificationLevel")
                )
    return cleaned


def main() -> int:
    """Command-line entry point.

    Returns 1 when the source directory is missing or contains no ``*.json``
    files. Otherwise returns 0, even if individual fixtures were skipped
    because they were invalid JSON or not a JSON object; those are reported
    on stderr.
    """
    parser = argparse.ArgumentParser(description="Prepare hand-edited CV JSON fixtures for benchmark comparison.")
    parser.add_argument("--source", default="/home/pi/cvs/jsons", help="Directory containing hand-edited JSON fixtures")
    parser.add_argument("--output", default="/home/pi/cvs/approved-jsons", help="Directory to write cleaned approved fixtures")
    args = parser.parse_args()

    source = pathlib.Path(args.source)
    output = pathlib.Path(args.output)

    if not source.is_dir():
        print(f"Source directory not found: {source}", file=sys.stderr)
        return 1

    output.mkdir(parents=True, exist_ok=True)

    files = sorted(source.glob("*.json"))
    if not files:
        print(f"No JSON files found in {source}", file=sys.stderr)
        return 1

    for path in files:
        try:
            payload = load_relaxed_json(path)
        except json.JSONDecodeError as exc:
            print(f"Invalid JSON in {path.name}: line {exc.lineno}, column {exc.colno}: {exc.msg}", file=sys.stderr)
            continue

        if not isinstance(payload, dict):
            print(f"Skipping non-object JSON fixture: {path.name}", file=sys.stderr)
            continue

        normalized = normalize_fixture(payload)
        destination = output / path.name
        destination.write_text(json.dumps(normalized, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
        print(f"Prepared {destination}")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())