#!/usr/bin/env python3
"""Compare actual CV extractor outputs against approved fixture JSON files.

Files are paired by file name between the approved directory and the actual
directory. Each pair is passed through ``normalize_fixture`` (loaded from the
sibling helper script) and then diffed structurally. The results are written
as ``comparison.json`` and ``comparison.md`` into the output directory.
"""
import argparse
import importlib.util
import json
import pathlib
import sys
from collections import Counter
from typing import Any

# The helper's file name contains hyphens, so it cannot be imported with a
# plain ``import`` statement; load it by path from the directory of this script.
HELPER_PATH = pathlib.Path(__file__).with_name("prepare-cv-approved-fixtures.py")
HELPER_SPEC = importlib.util.spec_from_file_location("prepare_cv_approved_fixtures", HELPER_PATH)
if HELPER_SPEC is None or HELPER_SPEC.loader is None:
    raise RuntimeError(f"Unable to load helper script: {HELPER_PATH}")
HELPER_MODULE = importlib.util.module_from_spec(HELPER_SPEC)
HELPER_SPEC.loader.exec_module(HELPER_MODULE)
# NOTE(review): normalize_fixture is defined in the helper script and is not
# visible here; it is applied to both sides before diffing.
normalize_fixture = HELPER_MODULE.normalize_fixture


def load_json(path: pathlib.Path) -> Any:
    """Read *path* as UTF-8 text and parse it as JSON.

    Undecodable bytes are replaced (``errors="replace"``) instead of raising,
    so a file with a stray invalid byte is still parsed and compared.

    Raises:
        OSError: if the file cannot be read.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8", errors="replace"))


def diff_values(expected: Any, actual: Any, path: str = "$") -> list[dict[str, Any]]:
    """Recursively compare two JSON-like values and list their differences.

    Args:
        expected: The approved value.
        actual: The value to check against *expected*.
        path: Location of the values being compared; ``$`` is the root,
            dict keys are appended as ``.key`` and list indices as ``[i]``.

    Returns:
        A list of dicts with the keys ``path``, ``expected``, ``actual`` and
        ``kind``, where ``kind`` is one of ``type-mismatch``,
        ``unexpected-key``, ``missing-key``, ``length-mismatch`` or
        ``value-mismatch``. The list is empty when the values are identical.
    """
    diffs: list[dict[str, Any]] = []

    # Exact type comparison on purpose: isinstance() would treat bool as int
    # and would let values of different JSON types fall through to ``!=``.
    if type(expected) is not type(actual):
        diffs.append({"path": path, "expected": expected, "actual": actual, "kind": "type-mismatch"})
        return diffs

    if isinstance(expected, dict):
        expected_keys = set(expected.keys())
        actual_keys = set(actual.keys())
        # Sorted so that the report order is deterministic.
        for key in sorted(expected_keys | actual_keys):
            child_path = f"{path}.{key}"
            if key not in expected:
                diffs.append({"path": child_path, "expected": None, "actual": actual[key], "kind": "unexpected-key"})
                continue
            if key not in actual:
                diffs.append({"path": child_path, "expected": expected[key], "actual": None, "kind": "missing-key"})
                continue
            diffs.extend(diff_values(expected[key], actual[key], child_path))
        return diffs

    if isinstance(expected, list):
        if len(expected) != len(actual):
            diffs.append({
                "path": path,
                "expected": f"len={len(expected)}",
                "actual": f"len={len(actual)}",
                "kind": "length-mismatch",
            })
        # zip() stops at the shorter list: the overlapping prefix is still
        # compared element-wise, while surplus elements are only reflected
        # by the single length-mismatch entry above.
        for index, (expected_item, actual_item) in enumerate(zip(expected, actual)):
            diffs.extend(diff_values(expected_item, actual_item, f"{path}[{index}]"))
        return diffs

    if expected != actual:
        diffs.append({"path": path, "expected": expected, "actual": actual, "kind": "value-mismatch"})
    return diffs


def summarize_sections(diffs: list[dict[str, Any]]) -> dict[str, int]:
    """Count differences per top-level section of their path.

    The section is the first path component after the leading ``$.``, cut at
    the first ``.`` or ``[``. Paths that do not start with ``$.`` (the root
    itself, or an index into a root-level list) are counted under ``$``.

    Returns:
        A dict mapping section name to number of differences, ordered by
        descending count and then by section name.
    """
    counts = Counter(
        diff["path"].removeprefix("$.").split(".", 1)[0].split("[", 1)[0]
        for diff in diffs
    )
    return dict(sorted(counts.items(), key=lambda item: (-item[1], item[0])))


def render_markdown(
    results: list[dict[str, Any]],
    missing_expected: list[str],
    missing_actual: list[str],
    approved_dir: pathlib.Path,
    actual_dir: pathlib.Path,
    max_diffs: int = 50,
) -> str:
    """Render the comparison results as a Markdown report.

    Args:
        results: One dict per compared file with the keys ``file``,
            ``diffCount``, ``sectionCounts`` and ``diffs``.
        missing_expected: File names listed under "Missing approved fixtures".
        missing_actual: File names listed under "Missing actual outputs".
        approved_dir: Approved fixture directory, shown in the header.
        actual_dir: Actual output directory, shown in the header.
        max_diffs: Maximum number of differences listed in detail per file;
            any remainder is summarized in a single "more differences
            omitted" line.

    Returns:
        The report text, without a trailing newline.
    """
    total = len(results)
    matched = sum(1 for result in results if result["diffCount"] == 0)
    lines = [
        "# CV fixture comparison report",
        "",
        f"- Approved fixtures: `{approved_dir}`",
        f"- Actual outputs: `{actual_dir}`",
        f"- Compared files: {total}",
        f"- Exact matches: {matched}",
        f"- Files with differences: {total - matched}",
        "",
    ]

    if missing_expected:
        lines.append("## Missing approved fixtures")
        lines.extend(f"- {name}" for name in missing_expected)
        lines.append("")

    if missing_actual:
        lines.append("## Missing actual outputs")
        lines.extend(f"- {name}" for name in missing_actual)
        lines.append("")

    lines.append("## File results")
    lines.append("")
    lines.append("| File | Diffs | Top sections |")
    lines.append("|---|---:|---|")
    for result in results:
        section_summary = ", ".join(f"{key}:{value}" for key, value in result["sectionCounts"].items()) or "—"
        lines.append(f"| {result['file']} | {result['diffCount']} | {section_summary} |")
    lines.append("")

    lines.append("## Detailed mismatches")
    lines.append("")
    for result in results:
        if result["diffCount"] == 0:
            continue
        lines.append(f"### {result['file']}")
        for diff in result["diffs"][:max_diffs]:
            lines.append(f"- `{diff['path']}` ({diff['kind']})")
            # Two-space indent: a child bullet must be indented to the
            # parent's content column to render as nested in Markdown.
            lines.append(f"  - expected: `{json.dumps(diff['expected'], ensure_ascii=False)}`")
            lines.append(f"  - actual: `{json.dumps(diff['actual'], ensure_ascii=False)}`")
        if result["diffCount"] > max_diffs:
            lines.append(f"- … {result['diffCount'] - max_diffs} more differences omitted")
        lines.append("")

    return "\n".join(lines)


def main() -> int:
    """Run the comparison from the command line.

    Returns:
        ``1`` if the approved or actual directory does not exist, otherwise
        ``0`` (also when differences were found; they are only reported).
    """
    parser = argparse.ArgumentParser(description="Compare actual CV extractor outputs against approved fixtures.")
    parser.add_argument("--approved", default="/home/pi/cvs/approved-jsons", help="Directory containing approved fixture JSON files")
    parser.add_argument("--actual", required=True, help="Directory containing actual extractor output JSON files")
    parser.add_argument("--output", help="Directory to write comparison artifacts. Defaults to <actual>/../comparison")
    args = parser.parse_args()

    approved_dir = pathlib.Path(args.approved)
    actual_dir = pathlib.Path(args.actual)
    output_dir = pathlib.Path(args.output) if args.output else actual_dir.parent / "comparison"

    if not approved_dir.is_dir():
        print(f"Approved fixture directory not found: {approved_dir}", file=sys.stderr)
        return 1
    if not actual_dir.is_dir():
        print(f"Actual output directory not found: {actual_dir}", file=sys.stderr)
        return 1

    # Files are paired purely by file name; subdirectories are not searched.
    approved_files = {path.name: path for path in approved_dir.glob("*.json")}
    actual_files = {path.name: path for path in actual_dir.glob("*.json")}
    common_names = sorted(set(approved_files) & set(actual_files))
    missing_expected = sorted(set(actual_files) - set(approved_files))
    missing_actual = sorted(set(approved_files) - set(actual_files))

    results: list[dict[str, Any]] = []
    for name in common_names:
        expected = normalize_fixture(load_json(approved_files[name]))
        actual = normalize_fixture(load_json(actual_files[name]))
        diffs = diff_values(expected, actual)
        results.append({
            "file": name,
            "diffCount": len(diffs),
            "sectionCounts": summarize_sections(diffs),
            "diffs": diffs,
        })

    output_dir.mkdir(parents=True, exist_ok=True)
    json_path = output_dir / "comparison.json"
    markdown_path = output_dir / "comparison.md"

    json_path.write_text(json.dumps({
        "approvedDir": str(approved_dir),
        "actualDir": str(actual_dir),
        "comparedFiles": len(results),
        "missingApprovedFixtures": missing_expected,
        "missingActualOutputs": missing_actual,
        "results": results,
    }, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    markdown_path.write_text(
        render_markdown(results, missing_expected, missing_actual, approved_dir, actual_dir) + "\n",
        encoding="utf-8",
    )

    print(f"Comparison JSON: {json_path}")
    print(f"Comparison markdown: {markdown_path}")
    print(f"Compared files: {len(results)}")
    print(f"Files with differences: {sum(1 for result in results if result['diffCount'] > 0)}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())