Files
jobtrackingapp/scripts/export-cv-corpus-json.sh

92 lines
2.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# Strict mode: stop on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Repository root is the parent of the directory containing this script.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
ROOT_DIR="$(cd "$script_dir/.." && pwd)"

# File patterns to ignore, comma-separated; CV_BENCHMARK_IGNORE replaces the
# built-in list when it is set and non-empty.
DEFAULT_IGNORE_PATTERNS="cv-template.pdf,Resume.en.pdf,EPS-*.pdf"
IGNORE_PATTERNS="${CV_BENCHMARK_IGNORE:-$DEFAULT_IGNORE_PATTERNS}"

# Corpus location; CV_CORPUS_DIR overrides the default under $HOME.
CORPUS_DIR="${CV_CORPUS_DIR:-$HOME/cvs}"

# Each run gets its own timestamped export root unless CV_JSON_EXPORT_DIR
# names one explicitly.
TIMESTAMP="$(date +%Y%m%d-%H%M%S)"
default_export_root="$ROOT_DIR/tmp/cv-json-export/$TIMESTAMP"
EXPORT_ROOT="${CV_JSON_EXPORT_DIR:-$default_export_root}"

# Sub-directories of the export root used by the rest of the script.
OUTPUT_ROOT="$EXPORT_ROOT/benchmark"
APPROVED_DIR="$EXPORT_ROOT/approved-fixtures"
EDITABLE_DIR="$EXPORT_ROOT/editable-json"
#######################################
# Locate a dotnet executable.
# Prefers whatever `command -v dotnet` finds; otherwise probes a fixed list
# of install locations in order and takes the first executable one.
# Globals:   HOME (read)
# Arguments: none
# Outputs:   the resolved path on stdout, followed by a newline
# Returns:   0 when a dotnet was found, 1 otherwise
#######################################
resolve_dotnet() {
  local on_path
  if on_path="$(command -v dotnet 2>/dev/null)"; then
    printf '%s\n' "$on_path"
    return 0
  fi

  local probe
  for probe in \
    "$HOME/.gsd/agent/bin/dotnet" \
    "$HOME/.dotnet/dotnet" \
    "/usr/bin/dotnet" \
    "/usr/local/bin/dotnet"; do
    # Skip anything that is missing or not executable.
    [[ -x "$probe" ]] || continue
    printf '%s\n' "$probe"
    return 0
  done

  return 1
}
# Resolve dotnet without tripping errexit: `|| true` turns a failed lookup
# into an empty string, which is reported after the corpus check below.
DOTNET_BIN="$(resolve_dotnet || true)"

# The corpus directory must exist.
[[ -d "$CORPUS_DIR" ]] || {
  printf '%s\n' "CV corpus directory not found: $CORPUS_DIR" >&2
  exit 1
}

# A dotnet executable must have been found.
[[ -n "$DOTNET_BIN" ]] || {
  printf '%s\n' \
    "dotnet not found on PATH or known fallback locations." \
    "Checked: ~/.gsd/agent/bin/dotnet, ~/.dotnet/dotnet, /usr/bin/dotnet, /usr/local/bin/dotnet" >&2
  exit 1
}
# The benchmark harness scans one fixed directory, so the corpus has to be
# that directory. Compare physical paths instead of raw strings: the message
# below suggests a symlink, and a plain string comparison would still reject
# a symlinked (or trailing-slash) path that names the same directory.
HARNESS_CORPUS_DIR="/home/pi/cvs"
# If a directory cannot be entered, fall back to its literal path so the
# comparison degrades to the plain string check.
corpus_physical="$(CDPATH= cd -- "$CORPUS_DIR" 2>/dev/null && pwd -P)" \
  || corpus_physical="$CORPUS_DIR"
harness_physical="$(CDPATH= cd -- "$HARNESS_CORPUS_DIR" 2>/dev/null && pwd -P)" \
  || harness_physical="$HARNESS_CORPUS_DIR"
if [[ "$corpus_physical" != "$harness_physical" ]]; then
  echo "This wrapper currently relies on the existing benchmark harness, which scans /home/pi/cvs." >&2
  echo "Set up a symlink or move the corpus there, or rerun with CV_CORPUS_DIR=/home/pi/cvs." >&2
  exit 1
fi
# Create the export tree: two of these directories are handed to the test run
# through CV_BENCHMARK_* variables, the third receives the editable copies.
mkdir -p "$OUTPUT_ROOT" "$APPROVED_DIR" "$EDITABLE_DIR"

printf '%s\n' \
  "Exporting structured CV JSON from: $CORPUS_DIR" \
  "Working directory: $EXPORT_ROOT" \
  "Ignoring files: $IGNORE_PATTERNS"

# Arguments for `dotnet test`, restricted to the CvCorpusHarnessTests filter.
harness_args=(
  test "$ROOT_DIR/JobTrackerApi.Tests/JobTrackerApi.Tests.csproj"
  --filter CvCorpusHarnessTests
  /p:DisableSourceControlManagerQueries=true
)

# The three settings are scoped to this one command via prefix assignments.
CV_BENCHMARK_OUTPUT_DIR="$OUTPUT_ROOT" \
  CV_BENCHMARK_APPROVED_DIR="$APPROVED_DIR" \
  CV_BENCHMARK_IGNORE="$IGNORE_PATTERNS" \
  "$DOTNET_BIN" "${harness_args[@]}"
#######################################
# Copy the *.json entries found directly inside one directory into another,
# keeping their file names.
# Arguments: $1 - source directory; if it does not exist nothing is copied
#            $2 - destination directory (must already exist)
# Returns:   0 when the source directory is missing; otherwise the status of
#            the find | copy pipeline
#######################################
copy_json_outputs() {
  local src_dir=$1 dest_dir=$2 json_file
  [[ -d "$src_dir" ]] || return 0
  # `! -type d` leaves out directories whose names end in .json; cp without
  # -r fails on those, which would abort the whole script under errexit.
  find "$src_dir" -maxdepth 1 ! -type d -name '*.json' -print0 |
    while IFS= read -r -d '' json_file; do
      # Copying into "dir/" keeps the base name without a basename subprocess.
      cp -- "$json_file" "$dest_dir/"
    done
}

copy_json_outputs "$OUTPUT_ROOT/outputs" "$EDITABLE_DIR"
# Final report: where the generated files are and how to review them.
summary_lines=(
  "Done."
  "Generated files:"
  "- Latest parser output: $OUTPUT_ROOT/outputs"
  "- Editable copies: $EDITABLE_DIR"
  "- Candidate fixtures: $OUTPUT_ROOT/candidate-fixtures"
  "- Summary report: $OUTPUT_ROOT/report.md"
  "- Machine index: $OUTPUT_ROOT/index.json"
  "Recommended workflow:"
  "1. Edit files in: $EDITABLE_DIR"
  "2. Keep only the fields you really want the extractor to produce."
  "3. Once reviewed, copy the corrected files into an approved fixtures directory."
  "4. Re-run the benchmark and compare actual vs approved output."
)
printf '%s\n' "${summary_lines[@]}"