Refactor, security updates, CV extraction upgrades
This commit is contained in:
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import pathlib
|
||||
import re
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
# Each canonical qualification label paired with the lowercase spellings that
# collapse onto it. Kept private; QUALIFICATION_LEVEL_MAP is the public lookup.
_QUALIFICATION_ALIASES = (
    ("Secondary", ("secondary",)),
    ("Diploma/Certificate", ("diploma/certificate", "diploma", "certificate")),
    ("Bachelor", ("bachelor", "bachelor's", "bachelors")),
    ("Master", ("master", "master's", "masters")),
    ("PhD", ("phd", "doctorate")),
    ("Other", ("other",)),
)

# Lookup table: lowercase alias -> canonical qualification label.
QUALIFICATION_LEVEL_MAP = {
    alias: label
    for label, aliases in _QUALIFICATION_ALIASES
    for alias in aliases
}
|
||||
|
||||
# Top-level fixture keys that normalize_fixture keeps; any other top-level key
# in a payload is dropped.
TOP_LEVEL_KEYS = set(
    (
        "version",
        "contact",
        "summary",
        "jobs",
        "education",
        "certifications",
        "projects",
        "skills",
        "languages",
        "interests",
        "otherSections",
    )
)
|
||||
|
||||
# Placeholder values found in hand-edited fixtures: a bare ``?`` or ``Null``
# directly after a key's colon and before ``,``, ``}`` or ``]``. Both patterns
# capture the surrounding text so it can be re-emitted around ``null``.
BARE_QUESTION_RE = re.compile(r'(:\s*)\?(\s*[,}\]])')
BARE_NULL_RE = re.compile(r'(:\s*)Null(\s*[,}\]])')

# A complete double-quoted JSON string literal, honouring backslash escapes.
# The single capture group makes ``split`` return the literals interleaved
# with the text between them.
_JSON_STRING_RE = re.compile(r'("(?:[^"\\]|\\.)*")', re.DOTALL)


def _relax_placeholders(text: str) -> str:
    """Rewrite bare ``?`` / ``Null`` values to ``null`` outside string literals."""
    pieces = _JSON_STRING_RE.split(text)
    # Even indices hold the text between string literals; odd indices hold the
    # literals themselves, which must pass through untouched so that content
    # such as "x: ?, y" is not corrupted.
    for index in range(0, len(pieces), 2):
        piece = BARE_QUESTION_RE.sub(r'\1null\2', pieces[index])
        pieces[index] = BARE_NULL_RE.sub(r'\1null\2', piece)
    return "".join(pieces)


def load_relaxed_json(path: pathlib.Path) -> Any:
    """Read *path* and parse it as JSON, tolerating placeholder values.

    A bare ``?`` or ``Null`` used as an object value is treated as ``null``.
    Text inside JSON string literals is never rewritten. The file is decoded
    as UTF-8 with undecodable bytes replaced rather than raising.

    Args:
        path: Location of the JSON fixture to read.

    Returns:
        The decoded JSON document (any JSON type).

    Raises:
        json.JSONDecodeError: If the text is still not valid JSON after the
            placeholder rewrite.
        OSError: If the file cannot be read.
    """
    raw = path.read_text(encoding="utf-8", errors="replace")
    return json.loads(_relax_placeholders(raw))
|
||||
|
||||
|
||||
def normalize_qualification_level(value: Any) -> Any:
    """Map a qualification-level string onto its canonical label.

    Non-string values, including ``None``, are returned unchanged. A string is
    looked up in ``QUALIFICATION_LEVEL_MAP`` after trimming surrounding
    whitespace and lowercasing; when no alias matches, the original string is
    returned exactly as given (untrimmed, original case).
    """
    if not isinstance(value, str):
        return value
    lookup_key = value.strip().lower()
    # Falling back to the input keeps unrecognised levels visible downstream.
    return QUALIFICATION_LEVEL_MAP.get(lookup_key, value)
|
||||
|
||||
|
||||
def normalize_value(value: Any) -> Any:
    """Return a recursively cleaned copy of *value*.

    Dicts are rebuilt without their ``metadata`` and ``sections`` keys, at
    every nesting depth; lists are rebuilt element by element. Any other value
    (strings, numbers, booleans, ``None``, ...) is returned as-is.
    """
    if isinstance(value, list):
        return [normalize_value(element) for element in value]
    if not isinstance(value, dict):
        return value
    return {
        name: normalize_value(child)
        for name, child in value.items()
        if name not in {"metadata", "sections"}
    }
|
||||
|
||||
|
||||
def normalize_fixture(payload: dict[str, Any]) -> dict[str, Any]:
    """Build the cleaned form of one fixture payload.

    Only keys listed in ``TOP_LEVEL_KEYS`` are kept, each passed through
    ``normalize_value``. Missing keys are then filled in: ``version`` defaults
    to ``"1"``, ``contact`` to an empty dict, and every other section to an
    empty list. Finally each dict entry under ``education`` gets its
    ``qualificationLevel`` normalised (the key is added as ``None`` when the
    entry lacks it).

    Args:
        payload: The decoded top-level JSON object of a fixture.

    Returns:
        A new dict holding the cleaned fixture.
    """
    cleaned = {key: normalize_value(value) for key, value in payload.items() if key in TOP_LEVEL_KEYS}

    cleaned.setdefault("version", "1")
    cleaned.setdefault("contact", {})
    for list_key in (
        "summary",
        "jobs",
        "education",
        "certifications",
        "projects",
        "skills",
        "languages",
        "interests",
        "otherSections",
    ):
        cleaned.setdefault(list_key, [])

    # setdefault keeps an explicit value such as "education": null, so the
    # section is not guaranteed to be a list. Guard before iterating instead
    # of raising TypeError on None or other non-iterable values.
    education_entries = cleaned["education"]
    if isinstance(education_entries, list):
        for entry in education_entries:
            if isinstance(entry, dict):
                entry["qualificationLevel"] = normalize_qualification_level(entry.get("qualificationLevel"))

    return cleaned
|
||||
|
||||
|
||||
def main() -> int:
    """Clean every ``*.json`` fixture in the source directory into the output directory.

    Command-line options ``--source`` and ``--output`` select the two
    directories. Files that fail to parse, or whose top-level JSON value is
    not an object, are reported on stderr and skipped.

    Returns:
        ``1`` when the source directory is missing or contains no ``*.json``
        files, otherwise ``0``.
    """
    cli = argparse.ArgumentParser(description="Prepare hand-edited CV JSON fixtures for benchmark comparison.")
    cli.add_argument("--source", default="/home/pi/cvs/jsons", help="Directory containing hand-edited JSON fixtures")
    cli.add_argument("--output", default="/home/pi/cvs/approved-jsons", help="Directory to write cleaned approved fixtures")
    options = cli.parse_args()

    source_dir = pathlib.Path(options.source)
    output_dir = pathlib.Path(options.output)

    if not source_dir.is_dir():
        print(f"Source directory not found: {source_dir}", file=sys.stderr)
        return 1

    # The output directory is created before the fixture listing is checked.
    output_dir.mkdir(parents=True, exist_ok=True)

    fixture_paths = sorted(source_dir.glob("*.json"))
    if not fixture_paths:
        print(f"No JSON files found in {source_dir}", file=sys.stderr)
        return 1

    for fixture_path in fixture_paths:
        try:
            payload = load_relaxed_json(fixture_path)
        except json.JSONDecodeError as exc:
            print(f"Invalid JSON in {fixture_path.name}: line {exc.lineno}, column {exc.colno}: {exc.msg}", file=sys.stderr)
            continue

        if isinstance(payload, dict):
            serialized = json.dumps(normalize_fixture(payload), indent=2, ensure_ascii=False) + "\n"
            destination = output_dir / fixture_path.name
            destination.write_text(serialized, encoding="utf-8")
            print(f"Prepared {destination}")
        else:
            print(f"Skipping non-object JSON fixture: {fixture_path.name}", file=sys.stderr)

    return 0
|
||||
|
||||
|
||||
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user