refactor, security updates, cv extraction upgrades
This commit is contained in:
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import importlib.util
|
||||
import json
|
||||
import pathlib
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
# The helper script lives next to this file. Its file name contains hyphens,
# which are not valid in a Python module name, so it is loaded by path.
HELPER_PATH = pathlib.Path(__file__).parent / "prepare-cv-approved-fixtures.py"
HELPER_SPEC = importlib.util.spec_from_file_location(
    "prepare_cv_approved_fixtures",
    HELPER_PATH,
)
# Without a spec, or without a loader on it, the file cannot be executed.
if getattr(HELPER_SPEC, "loader", None) is None:
    raise RuntimeError(f"Unable to load helper script: {HELPER_PATH}")
HELPER_MODULE = importlib.util.module_from_spec(HELPER_SPEC)
HELPER_SPEC.loader.exec_module(HELPER_MODULE)

# Expose the helper's normaliser under a plain module-level name.
normalize_fixture = HELPER_MODULE.normalize_fixture
|
||||
|
||||
|
||||
def load_json(path: pathlib.Path) -> Any:
    """Read *path* as UTF-8 text and return the decoded JSON document.

    Bytes that are not valid UTF-8 are substituted (``errors="replace"``)
    instead of raising while reading. ``json.JSONDecodeError`` propagates when
    the text is not valid JSON.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    return json.loads(text)
|
||||
|
||||
|
||||
def diff_values(expected: Any, actual: Any, path: str = "$") -> list[dict[str, Any]]:
    """Recursively compare two decoded JSON values and list every difference.

    Args:
        expected: The reference value.
        actual: The value being checked against ``expected``.
        path: Location of the two values inside the document; ``"$"`` is the
            root, object members are appended as ``.key`` and list items as
            ``[index]``.

    Returns:
        A list of dicts with the keys ``path``, ``expected``, ``actual`` and
        ``kind``. ``kind`` is one of ``type-mismatch``, ``unexpected-key``,
        ``missing-key``, ``length-mismatch`` or ``value-mismatch``. The list is
        empty when the values are identical.
    """

    def record(kind: str, want: Any, got: Any, where: str = path) -> dict[str, Any]:
        return {"path": where, "expected": want, "actual": got, "kind": kind}

    # Exact type identity is required, so e.g. 1 and 1.0 are reported as a
    # type mismatch and nothing below them is inspected.
    if type(expected) is not type(actual):
        return [record("type-mismatch", expected, actual)]

    if isinstance(expected, dict):
        found: list[dict[str, Any]] = []
        # Visit the union of keys in sorted order so the output is stable.
        for key in sorted(expected.keys() | actual.keys()):
            child = f"{path}.{key}"
            if key not in expected:
                found.append(record("unexpected-key", None, actual[key], child))
            elif key not in actual:
                found.append(record("missing-key", expected[key], None, child))
            else:
                found += diff_values(expected[key], actual[key], child)
        return found

    if isinstance(expected, list):
        found = []
        if len(expected) != len(actual):
            found.append(record("length-mismatch", f"len={len(expected)}", f"len={len(actual)}"))
        # zip() stops at the shorter list: surplus items are only covered by
        # the length-mismatch entry above.
        for index, pair in enumerate(zip(expected, actual)):
            found += diff_values(*pair, f"{path}[{index}]")
        return found

    return [record("value-mismatch", expected, actual)] if expected != actual else []
|
||||
|
||||
|
||||
def summarize_sections(diffs: list[dict[str, Any]]) -> dict[str, int]:
    """Count differences per leading path component.

    The leading component is what follows an initial ``$.`` up to the first
    ``.`` or ``[``; a path without that prefix (such as the root ``$``) is
    counted under its own leading text.

    Returns:
        A dict ordered by descending count, ties broken by ascending name.
    """
    tally: dict[str, int] = {}
    for entry in diffs:
        trimmed = entry["path"].removeprefix("$.")
        section = trimmed.partition(".")[0].partition("[")[0]
        tally[section] = tally.get(section, 0) + 1

    # Sort by name first, then stably by count (largest first) so that equal
    # counts stay in alphabetical order.
    ranked = sorted(tally.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
|
||||
|
||||
|
||||
def render_markdown(results: list[dict[str, Any]], missing_expected: list[str], missing_actual: list[str], approved_dir: pathlib.Path, actual_dir: pathlib.Path) -> str:
    """Build the Markdown comparison report and return it as one string.

    Args:
        results: Per-file entries with the keys ``file``, ``diffCount``,
            ``sectionCounts`` and ``diffs``.
        missing_expected: Names listed under "Missing approved fixtures".
        missing_actual: Names listed under "Missing actual outputs".
        approved_dir: Shown in the header as the approved fixture location.
        actual_dir: Shown in the header as the actual output location.

    Returns:
        The report lines joined with newlines. At most 50 differences are
        spelled out per file; any remainder is summarised in a single line.
    """
    detail_limit = 50
    differing = [entry for entry in results if entry["diffCount"] != 0]
    total = len(results)

    report = [
        "# CV fixture comparison report",
        "",
        f"- Approved fixtures: `{approved_dir}`",
        f"- Actual outputs: `{actual_dir}`",
        f"- Compared files: {total}",
        f"- Exact matches: {total - len(differing)}",
        f"- Files with differences: {len(differing)}",
        "",
    ]

    # Each "missing" section is emitted only when it has something to list.
    for heading, names in (
        ("## Missing approved fixtures", missing_expected),
        ("## Missing actual outputs", missing_actual),
    ):
        if names:
            report += [heading, *(f"- {name}" for name in names), ""]

    report += ["## File results", "", "| File | Diffs | Top sections |", "|---|---:|---|"]
    for entry in results:
        cells = [f"{section}:{count}" for section, count in entry["sectionCounts"].items()]
        summary = ", ".join(cells) if cells else "—"
        report.append(f"| {entry['file']} | {entry['diffCount']} | {summary} |")

    report += ["", "## Detailed mismatches", ""]
    for entry in differing:
        report.append(f"### {entry['file']}")
        for diff in entry["diffs"][:detail_limit]:
            expected_text = json.dumps(diff["expected"], ensure_ascii=False)
            actual_text = json.dumps(diff["actual"], ensure_ascii=False)
            report += [
                f"- `{diff['path']}` ({diff['kind']})",
                f" - expected: `{expected_text}`",
                f" - actual: `{actual_text}`",
            ]
        hidden = entry["diffCount"] - detail_limit
        if hidden > 0:
            report.append(f"- … {hidden} more differences omitted")
        report.append("")

    return "\n".join(report)
|
||||
|
||||
|
||||
def _load_fixture(path: pathlib.Path) -> dict[str, Any]:
    """Load one fixture file and return its normalised form.

    Raises:
        ValueError: With a readable reason when the file cannot be read, is
            not valid JSON, or does not hold a JSON object at the top level.
    """
    try:
        payload = load_json(path)
    except json.JSONDecodeError as exc:
        raise ValueError(f"invalid JSON at line {exc.lineno}, column {exc.colno}: {exc.msg}") from exc
    except OSError as exc:
        raise ValueError(f"unreadable file: {exc}") from exc
    # normalize_fixture() calls .items() on its argument, so anything other
    # than a JSON object has to be rejected before it gets there.
    if not isinstance(payload, dict):
        raise ValueError(f"top-level JSON value is a {type(payload).__name__}, not an object")
    return normalize_fixture(payload)


def main() -> int:
    """Compare actual extractor outputs against approved fixtures.

    Files are paired by name across the two directories. Each pair is
    normalised and diffed, and the outcome is written to ``comparison.json``
    and ``comparison.md`` in the output directory.

    A file that cannot be read or parsed no longer aborts the whole run: the
    pair is reported with a single ``load-error`` difference at ``$`` whose
    ``expected`` / ``actual`` fields carry the failure reason for the affected
    side (``None`` for a side that loaded fine), and the remaining pairs are
    still compared.

    Returns:
        ``0`` when every paired file could be loaded (content differences do
        not affect the exit status); ``1`` when an input directory is missing
        or at least one paired file could not be loaded.
    """
    parser = argparse.ArgumentParser(description="Compare actual CV extractor outputs against approved fixtures.")
    parser.add_argument("--approved", default="/home/pi/cvs/approved-jsons", help="Directory containing approved fixture JSON files")
    parser.add_argument("--actual", required=True, help="Directory containing actual extractor output JSON files")
    parser.add_argument("--output", help="Directory to write comparison artifacts. Defaults to <actual>/../comparison")
    args = parser.parse_args()

    approved_dir = pathlib.Path(args.approved)
    actual_dir = pathlib.Path(args.actual)
    output_dir = pathlib.Path(args.output) if args.output else actual_dir.parent / "comparison"

    if not approved_dir.is_dir():
        print(f"Approved fixture directory not found: {approved_dir}", file=sys.stderr)
        return 1
    if not actual_dir.is_dir():
        print(f"Actual output directory not found: {actual_dir}", file=sys.stderr)
        return 1

    approved_files = {path.name: path for path in approved_dir.glob("*.json")}
    actual_files = {path.name: path for path in actual_dir.glob("*.json")}

    common_names = sorted(set(approved_files) & set(actual_files))
    # "Missing expected" = produced by the extractor but never approved;
    # "missing actual" = approved but not produced in this run.
    missing_expected = sorted(set(actual_files) - set(approved_files))
    missing_actual = sorted(set(approved_files) - set(actual_files))

    results: list[dict[str, Any]] = []
    load_failures = 0
    for name in common_names:
        loaded: dict[str, Any] = {}
        problems: dict[str, str] = {}
        for side, source in (("expected", approved_files[name]), ("actual", actual_files[name])):
            try:
                loaded[side] = _load_fixture(source)
            except ValueError as exc:
                problems[side] = str(exc)
                print(f"Cannot compare {name}: {source}: {exc}", file=sys.stderr)

        if problems:
            load_failures += 1
            diffs = [{
                "path": "$",
                "expected": problems.get("expected"),
                "actual": problems.get("actual"),
                "kind": "load-error",
            }]
        else:
            diffs = diff_values(loaded["expected"], loaded["actual"])

        results.append({
            "file": name,
            "diffCount": len(diffs),
            "sectionCounts": summarize_sections(diffs),
            "diffs": diffs,
        })

    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "comparison.json").write_text(json.dumps({
        "approvedDir": str(approved_dir),
        "actualDir": str(actual_dir),
        "comparedFiles": len(results),
        "missingApprovedFixtures": missing_expected,
        "missingActualOutputs": missing_actual,
        "results": results,
    }, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    (output_dir / "comparison.md").write_text(render_markdown(results, missing_expected, missing_actual, approved_dir, actual_dir) + "\n", encoding="utf-8")

    print(f"Comparison JSON: {output_dir / 'comparison.json'}")
    print(f"Comparison markdown: {output_dir / 'comparison.md'}")
    print(f"Compared files: {len(results)}")
    print(f"Files with differences: {sum(1 for result in results if result['diffCount'] > 0)}")
    if load_failures:
        # An unparsable input used to end the run with a traceback; keep
        # signalling that failure through the exit status.
        print(f"Files that could not be loaded: {load_failures}", file=sys.stderr)
        return 1
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Executable
+91
@@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env bash
|
||||
# Abort on errors, on use of unset variables, and on failures inside pipelines.
set -o errexit -o nounset -o pipefail

# Repository root: the parent of the directory that contains this script.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

# Every location below can be overridden through the environment.
CORPUS_DIR="${CV_CORPUS_DIR:-$HOME/cvs}"
TIMESTAMP="$(date +%Y%m%d-%H%M%S)"
EXPORT_ROOT="${CV_JSON_EXPORT_DIR:-$ROOT_DIR/tmp/cv-json-export/$TIMESTAMP}"

# Sub-directories of the export root used by the steps further down.
OUTPUT_ROOT="$EXPORT_ROOT/benchmark"
APPROVED_DIR="$EXPORT_ROOT/approved-fixtures"
EDITABLE_DIR="$EXPORT_ROOT/editable-json"

# Comma-separated file patterns handed to the benchmark as its ignore list.
DEFAULT_IGNORE_PATTERNS="cv-template.pdf,Resume.en.pdf,EPS-*.pdf"
IGNORE_PATTERNS="${CV_BENCHMARK_IGNORE:-$DEFAULT_IGNORE_PATTERNS}"
|
||||
|
||||
# Print the dotnet executable to use and return 0, or return 1 when none is found.
# A dotnet reachable through PATH wins; otherwise the first executable among a
# fixed list of fallback locations is printed.
resolve_dotnet() {
  local on_path
  if on_path="$(command -v dotnet 2>/dev/null)"; then
    printf '%s\n' "$on_path"
    return 0
  fi

  local fallback
  for fallback in \
    "$HOME/.gsd/agent/bin/dotnet" \
    "$HOME/.dotnet/dotnet" \
    "/usr/bin/dotnet" \
    "/usr/local/bin/dotnet"; do
    [[ -x "$fallback" ]] || continue
    printf '%s\n' "$fallback"
    return 0
  done

  return 1
}
|
||||
|
||||
# Resolve dotnet up front. "|| true" stops "set -e" from aborting here, so the
# dedicated error message further down can be printed instead.
DOTNET_BIN="$(resolve_dotnet || true)"

if [[ ! -d "$CORPUS_DIR" ]]; then
  echo "CV corpus directory not found: $CORPUS_DIR" >&2
  exit 1
fi

if [[ -z "$DOTNET_BIN" ]]; then
  echo "dotnet not found on PATH or known fallback locations." >&2
  echo "Checked: ~/.gsd/agent/bin/dotnet, ~/.dotnet/dotnet, /usr/bin/dotnet, /usr/local/bin/dotnet" >&2
  exit 1
fi

# The benchmark harness scans a fixed directory. Compare physical paths rather
# than raw strings, so a trailing slash, a relative path or a symlink that ends
# up at that directory is accepted as the same corpus.
HARNESS_CORPUS_DIR="/home/pi/cvs"
corpus_physical="$(cd "$CORPUS_DIR" >/dev/null 2>&1 && pwd -P || true)"
harness_physical="$(cd "$HARNESS_CORPUS_DIR" >/dev/null 2>&1 && pwd -P || true)"
if [[ -z "$corpus_physical" || "$corpus_physical" != "$harness_physical" ]]; then
  echo "This wrapper currently relies on the existing benchmark harness, which scans /home/pi/cvs." >&2
  echo "Set up a symlink or move the corpus there, or rerun with CV_CORPUS_DIR=/home/pi/cvs." >&2
  exit 1
fi

mkdir -p "$OUTPUT_ROOT" "$APPROVED_DIR" "$EDITABLE_DIR"

echo "Exporting structured CV JSON from: $CORPUS_DIR"
echo "Working directory: $EXPORT_ROOT"
echo "Ignoring files: $IGNORE_PATTERNS"

# The harness receives its output, approved-fixture and ignore settings through
# environment variables that are set for this one command only.
CV_BENCHMARK_OUTPUT_DIR="$OUTPUT_ROOT" \
CV_BENCHMARK_APPROVED_DIR="$APPROVED_DIR" \
CV_BENCHMARK_IGNORE="$IGNORE_PATTERNS" \
"$DOTNET_BIN" test "$ROOT_DIR/JobTrackerApi.Tests/JobTrackerApi.Tests.csproj" --filter CvCorpusHarnessTests /p:DisableSourceControlManagerQueries=true

# Copy each top-level JSON output into the editable directory. NUL-delimited
# names keep files with spaces or newlines in their names intact.
if [[ -d "$OUTPUT_ROOT/outputs" ]]; then
  find "$OUTPUT_ROOT/outputs" -maxdepth 1 -name '*.json' -print0 | while IFS= read -r -d '' file; do
    base="$(basename "$file")"
    cp "$file" "$EDITABLE_DIR/$base"
  done
fi

cat <<EOF

Done.

Generated files:
- Latest parser output: $OUTPUT_ROOT/outputs
- Editable copies: $EDITABLE_DIR
- Candidate fixtures: $OUTPUT_ROOT/candidate-fixtures
- Summary report: $OUTPUT_ROOT/report.md
- Machine index: $OUTPUT_ROOT/index.json

Recommended workflow:
1. Edit files in: $EDITABLE_DIR
2. Keep only the fields you really want the extractor to produce.
3. Once reviewed, copy the corrected files into an approved fixtures directory.
4. Re-run the benchmark and compare actual vs approved output.
EOF
|
||||
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import pathlib
|
||||
import re
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
# Lower-case spellings of a qualification level mapped to their canonical
# label. Written as (canonical, aliases) groups so that all spellings of one
# level sit side by side.
QUALIFICATION_LEVEL_MAP = {
    alias: canonical
    for canonical, aliases in (
        ("Secondary", ("secondary",)),
        ("Diploma/Certificate", ("diploma/certificate", "diploma", "certificate")),
        ("Bachelor", ("bachelor", "bachelor's", "bachelors")),
        ("Master", ("master", "master's", "masters")),
        ("PhD", ("phd", "doctorate")),
        ("Other", ("other",)),
    )
    for alias in aliases
}

# The only top-level keys that are kept when a fixture is normalised.
TOP_LEVEL_KEYS = {
    "version", "contact", "summary",
    "jobs", "education", "certifications", "projects",
    "skills", "languages", "interests",
    "otherSections",
}
|
||||
|
||||
# A bare ``?`` or ``Null`` used as an object value: it follows a colon and is
# followed by ``,``, ``}`` or ``]``. Both are rewritten to JSON ``null``.
BARE_QUESTION_RE = re.compile(r'(:\s*)\?(\s*[,}\]])')
BARE_NULL_RE = re.compile(r'(:\s*)Null(\s*[,}\]])')
# One complete double-quoted string literal, escape sequences included.
_JSON_STRING_RE = re.compile(r'"(?:[^"\\]|\\.)*"', re.DOTALL)


def _relax_placeholders(fragment: str) -> str:
    """Rewrite bare ``?`` / ``Null`` placeholder values in *fragment* to ``null``."""
    fragment = BARE_QUESTION_RE.sub(r'\1null\2', fragment)
    return BARE_NULL_RE.sub(r'\1null\2', fragment)


def load_relaxed_json(path: pathlib.Path) -> Any:
    """Load JSON from *path*, tolerating bare ``?`` and ``Null`` placeholder values.

    The file is read as UTF-8 with undecodable bytes replaced. Placeholders
    are rewritten to ``null`` only in the text *between* string literals, so
    a quoted value such as ``"why: ?, really"`` is returned unchanged instead
    of being turned into ``"why: null, really"``.

    Raises:
        json.JSONDecodeError: If the text is still not valid JSON after the
            placeholders have been rewritten.
    """
    raw = path.read_text(encoding="utf-8", errors="replace")

    pieces: list[str] = []
    cursor = 0
    for literal in _JSON_STRING_RE.finditer(raw):
        # Text before the literal is structural JSON and may hold placeholders;
        # the literal itself is copied through untouched.
        pieces.append(_relax_placeholders(raw[cursor:literal.start()]))
        pieces.append(literal.group(0))
        cursor = literal.end()
    pieces.append(_relax_placeholders(raw[cursor:]))

    return json.loads("".join(pieces))
|
||||
|
||||
|
||||
def normalize_qualification_level(value: Any) -> Any:
    """Return the canonical label for a qualification level string.

    Strings are trimmed and lower-cased before being looked up in
    ``QUALIFICATION_LEVEL_MAP``. A string without an entry there is returned
    exactly as it was passed in, and so is any non-string value, including
    ``None``.
    """
    if not isinstance(value, str):
        return value
    lookup_key = value.strip().lower()
    return QUALIFICATION_LEVEL_MAP.get(lookup_key, value)
|
||||
|
||||
|
||||
def normalize_value(value: Any) -> Any:
    """Return a copy of *value* with ``metadata`` and ``sections`` keys removed.

    Dicts and lists are rebuilt recursively, so those two keys are dropped at
    every nesting depth. Any other value is returned unchanged.
    """
    dropped_keys = {"metadata", "sections"}
    if isinstance(value, dict):
        return {
            key: normalize_value(item)
            for key, item in value.items()
            if key not in dropped_keys
        }
    if isinstance(value, list):
        return [normalize_value(item) for item in value]
    return value
|
||||
|
||||
|
||||
def normalize_fixture(payload: dict[str, Any]) -> dict[str, Any]:
    """Reduce a fixture to the known top-level keys and fill in defaults.

    Keys outside ``TOP_LEVEL_KEYS`` are dropped and the kept values are passed
    through ``normalize_value``. Missing keys get a default: ``"1"`` for
    ``version``, ``{}`` for ``contact`` and ``[]`` for every other section.
    Each dict inside the ``education`` list then has its ``qualificationLevel``
    normalised; entries that lack the key end up with it set to ``None``.

    A key that is present keeps its value even when that value is ``None``
    (``setdefault`` only fills in absent keys). ``education`` is therefore only
    walked when it really is a list, so ``"education": null`` is passed through
    instead of raising ``TypeError``.
    """
    cleaned = {key: normalize_value(value) for key, value in payload.items() if key in TOP_LEVEL_KEYS}

    cleaned.setdefault("version", "1")
    cleaned.setdefault("contact", {})
    # Same insertion order as before, so serialised output is unchanged.
    for list_key in (
        "summary",
        "jobs",
        "education",
        "certifications",
        "projects",
        "skills",
        "languages",
        "interests",
        "otherSections",
    ):
        cleaned.setdefault(list_key, [])

    education_entries = cleaned["education"]
    if isinstance(education_entries, list):
        for entry in education_entries:
            if isinstance(entry, dict):
                entry["qualificationLevel"] = normalize_qualification_level(entry.get("qualificationLevel"))

    return cleaned
|
||||
|
||||
|
||||
def main() -> int:
    """Clean every JSON fixture in ``--source`` and write it to ``--output``.

    Files that are not valid JSON (after the relaxed placeholder handling) or
    whose top-level value is not an object are reported on stderr and skipped.

    Returns:
        ``1`` when the source directory does not exist or holds no ``*.json``
        files, otherwise ``0``.
    """
    cli = argparse.ArgumentParser(description="Prepare hand-edited CV JSON fixtures for benchmark comparison.")
    cli.add_argument("--source", default="/home/pi/cvs/jsons", help="Directory containing hand-edited JSON fixtures")
    cli.add_argument("--output", default="/home/pi/cvs/approved-jsons", help="Directory to write cleaned approved fixtures")
    options = cli.parse_args()

    source_dir = pathlib.Path(options.source)
    if not source_dir.is_dir():
        print(f"Source directory not found: {source_dir}", file=sys.stderr)
        return 1

    # The output directory is created before the source is scanned, so it
    # exists even when no fixtures are found.
    output_dir = pathlib.Path(options.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    fixture_paths = sorted(source_dir.glob("*.json"))
    if not fixture_paths:
        print(f"No JSON files found in {source_dir}", file=sys.stderr)
        return 1

    for fixture_path in fixture_paths:
        try:
            payload = load_relaxed_json(fixture_path)
        except json.JSONDecodeError as exc:
            print(f"Invalid JSON in {fixture_path.name}: line {exc.lineno}, column {exc.colno}: {exc.msg}", file=sys.stderr)
        else:
            if isinstance(payload, dict):
                destination = output_dir / fixture_path.name
                rendered = json.dumps(normalize_fixture(payload), indent=2, ensure_ascii=False) + "\n"
                destination.write_text(rendered, encoding="utf-8")
                print(f"Prepared {destination}")
            else:
                print(f"Skipping non-object JSON fixture: {fixture_path.name}", file=sys.stderr)

    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user