172 lines
7.2 KiB
Python
172 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
import argparse
|
|
import importlib.util
|
|
import json
|
|
import pathlib
|
|
import sys
|
|
from typing import Any
|
|
|
|
# The helper lives next to this script in a file whose name contains hyphens,
# so it is loaded from its path through importlib rather than with a plain
# ``import`` statement.
HELPER_PATH = pathlib.Path(__file__).with_name("prepare-cv-approved-fixtures.py")
HELPER_SPEC = importlib.util.spec_from_file_location(
    "prepare_cv_approved_fixtures",
    HELPER_PATH,
)

# Fail early with a readable message when no usable spec/loader is available.
if HELPER_SPEC is None or HELPER_SPEC.loader is None:
    raise RuntimeError(f"Unable to load helper script: {HELPER_PATH}")

# Execute the helper module and re-export the one function this script uses.
HELPER_MODULE = importlib.util.module_from_spec(HELPER_SPEC)
HELPER_SPEC.loader.exec_module(HELPER_MODULE)
normalize_fixture = HELPER_MODULE.normalize_fixture
|
|
|
|
|
|
def load_json(path: pathlib.Path) -> Any:
    """Read *path* and return the JSON document it contains.

    The file is decoded with ``utf-8-sig`` so that a leading UTF-8 byte-order
    mark is dropped when present; ``json.loads`` refuses text that starts with
    U+FEFF. Files without a BOM decode exactly as plain UTF-8. Bytes that are
    not valid UTF-8 are replaced with U+FFFD instead of raising.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the decoded text is not valid JSON.
    """
    text = path.read_text(encoding="utf-8-sig", errors="replace")
    return json.loads(text)
|
|
|
|
|
|
def diff_values(expected: Any, actual: Any, path: str = "$") -> list[dict[str, Any]]:
    """Recursively compare two values and list every difference found.

    Each difference is a dict with the keys ``path``, ``expected``, ``actual``
    and ``kind`` (in that order). ``kind`` is one of:

    * ``type-mismatch``   - the two values have different exact types; the
      comparison does not descend any further at that path.
    * ``unexpected-key``  - a dict key present only in *actual*.
    * ``missing-key``     - a dict key present only in *expected*.
    * ``length-mismatch`` - two lists of different length; the items they have
      in common by position are still compared.
    * ``value-mismatch``  - two non-container values that are not equal.

    Args:
        expected: The reference value.
        actual: The value being checked against the reference.
        path: Location of the values; dict keys are appended as ``.key`` and
            list positions as ``[index]``.

    Returns:
        The differences in traversal order (dict keys are visited sorted);
        an empty list when the values match.
    """

    def entry(where: str, want: Any, got: Any, kind: str) -> dict[str, Any]:
        # Single place that fixes the key order of a diff record.
        return {"path": where, "expected": want, "actual": got, "kind": kind}

    # Exact type comparison: e.g. 1 and 1.0, or True and 1, count as different.
    if type(expected) is not type(actual):
        return [entry(path, expected, actual, "type-mismatch")]

    if isinstance(expected, dict):
        found: list[dict[str, Any]] = []
        for name in sorted(set(expected) | set(actual)):
            where = f"{path}.{name}"
            if name not in expected:
                found.append(entry(where, None, actual[name], "unexpected-key"))
            elif name not in actual:
                found.append(entry(where, expected[name], None, "missing-key"))
            else:
                found += diff_values(expected[name], actual[name], where)
        return found

    if isinstance(expected, list):
        found = []
        want_len, got_len = len(expected), len(actual)
        if want_len != got_len:
            found.append(entry(path, f"len={want_len}", f"len={got_len}", "length-mismatch"))
        # zip() stops at the shorter list, so only the shared prefix is compared.
        for position, pair in enumerate(zip(expected, actual)):
            found += diff_values(pair[0], pair[1], f"{path}[{position}]")
        return found

    if expected != actual:
        return [entry(path, expected, actual, "value-mismatch")]
    return []
|
|
|
|
|
|
def summarize_sections(diffs: list[dict[str, Any]]) -> dict[str, int]:
    """Count differences per top-level section of their ``path``.

    The section is the first path segment: a leading ``$.`` is removed, then
    everything from the first ``.`` or ``[`` onwards is cut off. Paths that do
    not start with ``$.`` (such as ``$`` or ``$[0]``) end up under ``$``.

    Args:
        diffs: Difference records, each carrying a ``path`` string.

    Returns:
        A dict mapping section name to count, ordered by descending count and
        then by section name.
    """
    tally: dict[str, int] = {}
    for record in diffs:
        location = record["path"]
        if location.startswith("$."):
            location = location[2:]
        section = location.partition(".")[0].partition("[")[0]
        tally[section] = tally.get(section, 0) + 1

    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return dict(ranked)
|
|
|
|
|
|
def render_markdown(results: list[dict[str, Any]], missing_expected: list[str], missing_actual: list[str], approved_dir: pathlib.Path, actual_dir: pathlib.Path) -> str:
    """Build the Markdown comparison report.

    The report contains, in order: a header with the two directories and the
    match counts, optional lists of files missing on either side, a table with
    one row per compared file, and a detail section for every file that has
    differences. At most the first 50 differences of a file are listed; any
    remainder is summarised in a single "more differences omitted" line.

    Args:
        results: One record per compared file with the keys ``file``,
            ``diffCount``, ``sectionCounts`` and ``diffs``.
        missing_expected: Names listed under "Missing approved fixtures".
        missing_actual: Names listed under "Missing actual outputs".
        approved_dir: Directory shown as the approved fixture location.
        actual_dir: Directory shown as the actual output location.

    Returns:
        The report as one string with lines joined by ``\\n``.
    """
    compared = len(results)
    clean = len([record for record in results if record["diffCount"] == 0])

    out = [
        "# CV fixture comparison report",
        "",
        f"- Approved fixtures: `{approved_dir}`",
        f"- Actual outputs: `{actual_dir}`",
        f"- Compared files: {compared}",
        f"- Exact matches: {clean}",
        f"- Files with differences: {compared - clean}",
        "",
    ]

    # Both "missing" sections share one layout and are omitted when empty.
    for heading, names in (
        ("## Missing approved fixtures", missing_expected),
        ("## Missing actual outputs", missing_actual),
    ):
        if names:
            out.append(heading)
            out += [f"- {name}" for name in names]
            out.append("")

    out += ["## File results", "", "| File | Diffs | Top sections |", "|---|---:|---|"]
    for record in results:
        pairs = [f"{key}:{value}" for key, value in record["sectionCounts"].items()]
        section_summary = ", ".join(pairs) if pairs else "—"
        out.append(f"| {record['file']} | {record['diffCount']} | {section_summary} |")

    out += ["", "## Detailed mismatches", ""]
    for record in results:
        diff_total = record["diffCount"]
        if diff_total == 0:
            continue
        out.append(f"### {record['file']}")
        for diff in record["diffs"][:50]:
            out.append(f"- `{diff['path']}` ({diff['kind']})")
            expected_text = json.dumps(diff["expected"], ensure_ascii=False)
            out.append(f" - expected: `{expected_text}`")
            actual_text = json.dumps(diff["actual"], ensure_ascii=False)
            out.append(f" - actual: `{actual_text}`")
        if diff_total > 50:
            out.append(f"- … {diff_total - 50} more differences omitted")
        out.append("")

    return "\n".join(out)
|
|
|
|
|
|
def main() -> int:
    """Compare actual extractor outputs with approved fixtures and write reports.

    Command-line options:
        --approved  Directory with approved fixture JSON files
                    (default ``/home/pi/cvs/approved-jsons``).
        --actual    Directory with actual extractor output JSON files (required).
        --output    Directory for the comparison artifacts; defaults to a
                    ``comparison`` directory beside the actual directory.

    ``*.json`` files present in both directories are loaded, passed through
    ``normalize_fixture`` and compared with ``diff_values``. The outcome is
    written to ``comparison.json`` and ``comparison.md`` in the output
    directory, and a short summary is printed to stdout.

    Returns:
        1 when either input directory does not exist (a message is printed to
        stderr), otherwise 0 - also when differences were found.
    """
    cli = argparse.ArgumentParser(description="Compare actual CV extractor outputs against approved fixtures.")
    cli.add_argument("--approved", default="/home/pi/cvs/approved-jsons", help="Directory containing approved fixture JSON files")
    cli.add_argument("--actual", required=True, help="Directory containing actual extractor output JSON files")
    cli.add_argument("--output", help="Directory to write comparison artifacts. Defaults to <actual>/../comparison")
    options = cli.parse_args()

    approved_dir = pathlib.Path(options.approved)
    actual_dir = pathlib.Path(options.actual)
    if options.output:
        output_dir = pathlib.Path(options.output)
    else:
        output_dir = actual_dir.parent / "comparison"

    # Both input directories must exist before anything is read or written.
    if not approved_dir.is_dir():
        print(f"Approved fixture directory not found: {approved_dir}", file=sys.stderr)
        return 1
    if not actual_dir.is_dir():
        print(f"Actual output directory not found: {actual_dir}", file=sys.stderr)
        return 1

    # Index the JSON files of each side by bare file name.
    approved_files = {entry.name: entry for entry in approved_dir.glob("*.json")}
    actual_files = {entry.name: entry for entry in actual_dir.glob("*.json")}
    approved_names = set(approved_files)
    actual_names = set(actual_files)

    common_names = sorted(approved_names & actual_names)
    missing_expected = sorted(actual_names - approved_names)
    missing_actual = sorted(approved_names - actual_names)

    results: list[dict[str, Any]] = []
    for name in common_names:
        expected = normalize_fixture(load_json(approved_files[name]))
        actual = normalize_fixture(load_json(actual_files[name]))
        diffs = diff_values(expected, actual)
        record = {
            "file": name,
            "diffCount": len(diffs),
            "sectionCounts": summarize_sections(diffs),
            "diffs": diffs,
        }
        results.append(record)

    output_dir.mkdir(parents=True, exist_ok=True)
    json_path = output_dir / "comparison.json"
    markdown_path = output_dir / "comparison.md"

    payload = {
        "approvedDir": str(approved_dir),
        "actualDir": str(actual_dir),
        "comparedFiles": len(results),
        "missingApprovedFixtures": missing_expected,
        "missingActualOutputs": missing_actual,
        "results": results,
    }
    json_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    markdown = render_markdown(results, missing_expected, missing_actual, approved_dir, actual_dir)
    markdown_path.write_text(markdown + "\n", encoding="utf-8")

    differing = sum(1 for result in results if result["diffCount"] > 0)
    print(f"Comparison JSON: {json_path}")
    print(f"Comparison markdown: {markdown_path}")
    print(f"Compared files: {len(results)}")
    print(f"Files with differences: {differing}")
    return 0
|
|
|
|
|
|
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|