135 lines
4.1 KiB
Python
135 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
import argparse
|
|
import json
|
|
import pathlib
|
|
import re
|
|
import sys
|
|
from typing import Any
|
|
|
|
# Lower-cased spelling -> canonical qualification label. Built from
# (canonical label, accepted spellings) groups so each label is written once.
QUALIFICATION_LEVEL_MAP = {
    alias: canonical
    for canonical, aliases in (
        ("Secondary", ("secondary",)),
        ("Diploma/Certificate", ("diploma/certificate", "diploma", "certificate")),
        ("Bachelor", ("bachelor", "bachelor's", "bachelors")),
        ("Master", ("master", "master's", "masters")),
        ("PhD", ("phd", "doctorate")),
        ("Other", ("other",)),
    )
    for alias in aliases
}
|
|
|
|
# Top-level fixture keys that are kept; membership in this set is the only
# thing checked, so the grouping below is purely for readability.
TOP_LEVEL_KEYS = {
    "version", "contact", "summary",
    "jobs", "education", "certifications", "projects",
    "skills", "languages", "interests",
    "otherSections",
}
|
|
|
|
# A bare ``?`` / ``Null`` standing where a JSON value belongs: directly after a
# key's colon and directly before ``,``, ``}`` or ``]``.
BARE_QUESTION_RE = re.compile(r'(:\s*)\?(\s*[,}\]])')
BARE_NULL_RE = re.compile(r'(:\s*)Null(\s*[,}\]])')

# One JSON string literal (escape-aware). The capturing group makes
# ``re.split`` return the literals alongside the text between them.
_JSON_STRING_RE = re.compile(r'("(?:\\.|[^"\\])*")')


def load_relaxed_json(path: pathlib.Path) -> Any:
    """Parse a hand-edited JSON file that may use ``?`` or ``Null`` as null.

    The file is read as UTF-8 (undecodable bytes become U+FFFD), each bare
    ``?`` or ``Null`` in value position is rewritten to ``null``, and the
    result is handed to ``json.loads``. The rewrite is applied only to text
    outside string literals, so a quoted value such as ``"Role: ?, TBD"`` is
    preserved verbatim.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        json.JSONDecodeError: If the text is still not valid JSON after the
            placeholder rewrite.
        OSError: If the file cannot be read.
    """
    raw = path.read_text(encoding="utf-8", errors="replace")

    # With a single capturing group, re.split yields the text between string
    # literals at even indexes and the literals themselves at odd indexes.
    chunks = _JSON_STRING_RE.split(raw)
    for index in range(0, len(chunks), 2):
        outside = BARE_QUESTION_RE.sub(r'\1null\2', chunks[index])
        chunks[index] = BARE_NULL_RE.sub(r'\1null\2', outside)

    return json.loads("".join(chunks))
|
|
|
|
|
|
def normalize_qualification_level(value: Any) -> Any:
    """Map a free-form qualification level onto its canonical label.

    Strings are trimmed and lower-cased before being looked up in
    ``QUALIFICATION_LEVEL_MAP``. A string with no entry in the map, and any
    non-string value (including ``None``), is returned exactly as given.
    """
    if not isinstance(value, str):
        # Covers None as well as numbers, lists, dicts, ...
        return value

    lookup_key = value.strip().lower()
    canonical = QUALIFICATION_LEVEL_MAP.get(lookup_key)
    if canonical is None:
        return value
    return canonical
|
|
|
|
|
|
def normalize_value(value: Any) -> Any:
    """Rebuild *value* with ``metadata`` and ``sections`` keys removed.

    Dicts are copied without those two keys, lists are copied element by
    element, and both are processed recursively. Every other value is
    returned unchanged.
    """
    if isinstance(value, dict):
        excluded = {"metadata", "sections"}
        return {
            name: normalize_value(child)
            for name, child in value.items()
            if name not in excluded
        }

    if isinstance(value, list):
        return [normalize_value(element) for element in value]

    # Strings, numbers, booleans, None and any other type pass straight through.
    return value
|
|
|
|
|
|
def normalize_fixture(payload: dict[str, Any]) -> dict[str, Any]:
    """Reduce a parsed fixture to the approved top-level layout.

    Keys that are not in ``TOP_LEVEL_KEYS`` are discarded and every retained
    value is passed through ``normalize_value``. Missing top-level keys are
    then filled with an empty default (``"1"`` for ``version``). Finally, each
    dict entry under ``education`` gets its ``qualificationLevel`` rewritten by
    ``normalize_qualification_level``; the key is added with ``None`` when the
    entry does not have one.

    Args:
        payload: The decoded JSON object of one fixture.

    Returns:
        A new dict containing every key of ``TOP_LEVEL_KEYS``.
    """
    cleaned = {
        key: normalize_value(value)
        for key, value in payload.items()
        if key in TOP_LEVEL_KEYS
    }

    # Built on every call so the mutable defaults are never shared between
    # fixtures. The order matches the order in which missing keys are appended.
    defaults = (
        ("version", "1"),
        ("contact", {}),
        ("summary", []),
        ("jobs", []),
        ("education", []),
        ("certifications", []),
        ("projects", []),
        ("skills", []),
        ("languages", []),
        ("interests", []),
        ("otherSections", []),
    )
    for key, default in defaults:
        cleaned.setdefault(key, default)

    # setdefault keeps an explicit null (e.g. ``"education": ?`` in the
    # hand-edited file), and iterating None would raise TypeError, so the
    # entries are only walked when the value really is a list.
    education_entries = cleaned["education"]
    if isinstance(education_entries, list):
        for entry in education_entries:
            if isinstance(entry, dict):
                entry["qualificationLevel"] = normalize_qualification_level(
                    entry.get("qualificationLevel")
                )

    return cleaned
|
|
|
|
|
|
def main() -> int:
    """Clean every ``*.json`` fixture in the source directory.

    Command-line options ``--source`` and ``--output`` select the input and
    output directories. Each fixture is loaded with ``load_relaxed_json``,
    passed through ``normalize_fixture`` and written under the same file name
    in the output directory. Fixtures that fail to parse, or whose top-level
    value is not an object, are reported on stderr and skipped.

    Returns:
        1 if the source directory is missing or holds no ``*.json`` files,
        otherwise 0 (also when individual fixtures were skipped).
    """
    cli = argparse.ArgumentParser(description="Prepare hand-edited CV JSON fixtures for benchmark comparison.")
    cli.add_argument("--source", default="/home/pi/cvs/jsons", help="Directory containing hand-edited JSON fixtures")
    cli.add_argument("--output", default="/home/pi/cvs/approved-jsons", help="Directory to write cleaned approved fixtures")
    options = cli.parse_args()

    source_dir = pathlib.Path(options.source)
    output_dir = pathlib.Path(options.output)

    if not source_dir.is_dir():
        print(f"Source directory not found: {source_dir}", file=sys.stderr)
        return 1

    # The output directory is created before the source is scanned for fixtures.
    output_dir.mkdir(parents=True, exist_ok=True)

    fixture_paths = sorted(source_dir.glob("*.json"))
    if not fixture_paths:
        print(f"No JSON files found in {source_dir}", file=sys.stderr)
        return 1

    for fixture_path in fixture_paths:
        try:
            payload = load_relaxed_json(fixture_path)
        except json.JSONDecodeError as exc:
            print(f"Invalid JSON in {fixture_path.name}: line {exc.lineno}, column {exc.colno}: {exc.msg}", file=sys.stderr)
            continue

        if isinstance(payload, dict):
            target = output_dir / fixture_path.name
            rendered = json.dumps(normalize_fixture(payload), indent=2, ensure_ascii=False)
            target.write_text(rendered + "\n", encoding="utf-8")
            print(f"Prepared {target}")
        else:
            print(f"Skipping non-object JSON fixture: {fixture_path.name}", file=sys.stderr)

    return 0
|
|
|
|
|
|
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|