Add canonical CV artifact pipeline

This commit is contained in:
2026-03-28 23:32:54 +01:00
parent d8ab312f59
commit 107c181506
10 changed files with 619 additions and 82 deletions
+235 -54
View File
@@ -1,11 +1,14 @@
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using JobTrackerApi.Data;
using JobTrackerApi.Services;
using JobTrackerApi.Models;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Identity;
using Microsoft.AspNetCore.Mvc;
using Microsoft.EntityFrameworkCore;
namespace JobTrackerApi.Controllers;
@@ -52,19 +55,28 @@ public sealed class ProfileCvController : ControllerBase
};
// Upper bound for an uploaded CV (5 * 1024 * 1024 bytes); applied to the upload action via [RequestSizeLimit].
private const long MaxFileSizeBytes = 5 * 1024 * 1024;
// Version tags copied onto every CvExtractionRun this controller creates, recording which
// parser / normalizer / LLM prompt revision produced that run.
private const string ParserVersion = "m005-s01";
private const string NormalizerVersion = "m005-s01";
private const string LlmPromptVersion = "m005-s01";
// Identity user manager: resolves the current user and saves profile changes (GetUserAsync / UpdateAsync).
private readonly UserManager<ApplicationUser> _users;
// AI service used for text extraction and section summarisation (ExtractTextAsync / SummarizeSectionAsync).
private readonly ISummarizerService _aiService;
// EF Core context holding the CvUploadArtifacts and CvExtractionRuns sets.
private readonly JobTrackerContext _db;
// Application paths; CvArtifactsRoot is the base folder under which uploaded CV files are stored.
private readonly AppPaths _paths;
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService)
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths)
{
_users = users;
_aiService = aiService;
_db = db;
_paths = paths;
}
// Request payload naming a CV section plus an optional style and target role.
// NOTE(review): presumably bound by the "rewrite-section" action, whose body is outside this view — confirm.
public sealed record RewriteSectionRequest(string SectionName, string? Style, string? TargetRole);
// Request payload carrying optional CV text for the parse action, which tests request?.Text for blankness.
public sealed record ParseCvRequest(string? Text);
// Output of ExtractStructuredCvFromFileAsync: the text as extracted from the file, the text after
// reconstruction and trimming, and the structured profile built from that normalized text.
private sealed record ExtractionPipelineResult(string RawText, string NormalizedText, StructuredCvProfile StructuredCv);
[HttpPost("upload")]
[RequestSizeLimit(MaxFileSizeBytes)]
public async Task<IActionResult> Upload([FromForm] IFormFile file)
@@ -80,48 +92,113 @@ public sealed class ProfileCvController : ControllerBase
return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
}
string text;
var canUseAiExtraction = string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".png", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".jpg", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".jpeg", StringComparison.OrdinalIgnoreCase)
|| string.Equals(extension, ".webp", StringComparison.OrdinalIgnoreCase);
var artifact = await SaveUploadArtifactAsync(user, file, HttpContext.RequestAborted);
_db.CvUploadArtifacts.Add(artifact);
await _db.SaveChangesAsync(HttpContext.RequestAborted);
if (canUseAiExtraction)
var run = new CvExtractionRun
{
await using var uploadStream = file.OpenReadStream();
var extracted = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, HttpContext.RequestAborted);
text = extracted?.Text?.Trim() ?? string.Empty;
OwnerUserId = user.Id,
ArtifactId = artifact.Id,
Trigger = "upload",
ParserVersion = ParserVersion,
NormalizerVersion = NormalizerVersion,
LlmPromptVersion = LlmPromptVersion,
Status = "running",
StartedAtUtc = DateTimeOffset.UtcNow,
};
_db.CvExtractionRuns.Add(run);
await _db.SaveChangesAsync(HttpContext.RequestAborted);
try
{
var result = await ExtractStructuredCvFromFileAsync(file, extension, HttpContext.RequestAborted);
result.StructuredCv.Metadata.ProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
result.StructuredCv.Metadata.AppliedExtractionRunId = run.Id;
result.StructuredCv.Metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;
var structuredJson = StructuredCvProfileJson.Serialize(result.StructuredCv);
run.RawExtractedText = result.RawText;
run.NormalizedText = result.NormalizedText;
run.StructuredProfileJson = structuredJson;
run.Status = "applied";
run.CompletedAtUtc = DateTimeOffset.UtcNow;
run.AppliedAtUtc = run.CompletedAtUtc;
user.ProfileCvText = result.NormalizedText;
user.ProfileCvStructureJson = structuredJson;
user.CurrentCvUploadArtifactId = artifact.Id;
user.CurrentCvExtractionRunId = run.Id;
user.CurrentCvProfileVersion = result.StructuredCv.Metadata.ProfileVersion;
var update = await _users.UpdateAsync(user);
if (!update.Succeeded)
{
run.Status = "failed";
run.ErrorMessage = string.Join("; ", update.Errors.Select(e => e.Description));
await _db.SaveChangesAsync(HttpContext.RequestAborted);
return BadRequest(run.ErrorMessage);
}
await _db.SaveChangesAsync(HttpContext.RequestAborted);
return Ok(new
{
imported = true,
characters = result.NormalizedText.Length,
structuredCv = result.StructuredCv,
sections = result.StructuredCv.Sections,
artifactId = artifact.Id,
extractionRunId = run.Id,
profileVersion = result.StructuredCv.Metadata.ProfileVersion,
});
}
else
catch (Exception ex)
{
text = string.Empty;
run.Status = "failed";
run.ErrorMessage = ex.Message;
run.CompletedAtUtc = DateTimeOffset.UtcNow;
await _db.SaveChangesAsync(HttpContext.RequestAborted);
throw;
}
}
[HttpPost("reprocess")]
public async Task<IActionResult> Reprocess()
{
var user = await _users.GetUserAsync(User);
if (user is null) return Unauthorized();
var artifact = await _db.CvUploadArtifacts
.OrderByDescending(x => x.UploadedAtUtc)
.FirstOrDefaultAsync(x => x.OwnerUserId == user.Id, HttpContext.RequestAborted);
if (artifact is null) return BadRequest("Upload a CV before reprocessing it.");
if (string.IsNullOrWhiteSpace(artifact.StoragePath) || !System.IO.File.Exists(artifact.StoragePath))
{
return BadRequest("The stored CV artifact could not be found for reprocessing.");
}
if (string.IsNullOrWhiteSpace(text))
await using var stream = System.IO.File.OpenRead(artifact.StoragePath);
var file = new FormFile(stream, 0, stream.Length, "file", artifact.OriginalFileName)
{
text = (await ExtractTextAsync(file, extension)).Trim();
}
if (string.IsNullOrWhiteSpace(text))
Headers = new HeaderDictionary(),
ContentType = artifact.MimeType
};
var extension = Path.GetExtension(artifact.OriginalFileName ?? string.Empty);
var result = await ExtractStructuredCvFromFileAsync(file, extension, HttpContext.RequestAborted);
await ApplyTextExtractionRunAsync(user, "reprocess", result.RawText, result.NormalizedText, result.StructuredCv, artifact.Id, HttpContext.RequestAborted);
return Ok(new
{
return BadRequest("The uploaded CV file could not be read or was empty.");
}
text = (await MaybeReconstructStructuredCvAsync(text, HttpContext.RequestAborted)).Trim();
var structuredCv = await BuildStructuredCvAsync(text, HttpContext.RequestAborted);
user.ProfileCvText = text;
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
var result = await _users.UpdateAsync(user);
if (!result.Succeeded)
{
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
}
return Ok(new { imported = true, characters = text.Length, structuredCv, sections = structuredCv.Sections });
reprocessed = true,
artifactId = artifact.Id,
extractionRunId = user.CurrentCvExtractionRunId,
profileVersion = user.CurrentCvProfileVersion,
structuredCv = result.StructuredCv,
sections = result.StructuredCv.Sections,
});
}
[HttpPost("rebuild")]
@@ -144,14 +221,9 @@ public sealed class ProfileCvController : ControllerBase
user.ProfileCvText = rebuilt.Trim();
var structuredCv = await BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted);
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
var result = await _users.UpdateAsync(user);
if (!result.Succeeded)
{
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
}
await ApplyTextExtractionRunAsync(user, "rebuild", user.ProfileCvText, user.ProfileCvText, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);
return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections });
return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections, extractionRunId = user.CurrentCvExtractionRunId, profileVersion = user.CurrentCvProfileVersion });
}
[HttpPost("rewrite-section")]
@@ -189,14 +261,13 @@ public sealed class ProfileCvController : ControllerBase
if (string.IsNullOrWhiteSpace(source)) return BadRequest("Add or import CV text before parsing sections.");
var structuredCv = await BuildStructuredCvAsync(source, HttpContext.RequestAborted);
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
var update = await _users.UpdateAsync(user);
if (!update.Succeeded)
if (string.IsNullOrWhiteSpace(request?.Text))
{
return BadRequest(string.Join("; ", update.Errors.Select(e => e.Description)));
user.ProfileCvText = source;
}
await ApplyTextExtractionRunAsync(user, "parse", source, source, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);
return Ok(new { structuredCv, sections = structuredCv.Sections, totalWords = CountWords(source) });
return Ok(new { structuredCv, sections = structuredCv.Sections, totalWords = CountWords(source), extractionRunId = user.CurrentCvExtractionRunId, profileVersion = user.CurrentCvProfileVersion });
}
[HttpPost("improve")]
@@ -219,14 +290,9 @@ public sealed class ProfileCvController : ControllerBase
user.ProfileCvText = improved.Trim();
var structuredCv = await BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted);
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
var result = await _users.UpdateAsync(user);
if (!result.Succeeded)
{
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
}
await ApplyTextExtractionRunAsync(user, "improve", user.ProfileCvText, user.ProfileCvText, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);
return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections });
return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections, extractionRunId = user.CurrentCvExtractionRunId, profileVersion = user.CurrentCvProfileVersion });
}
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
@@ -252,6 +318,121 @@ public sealed class ProfileCvController : ControllerBase
return StructuredCvProfileJson.Normalize(merged);
}
/// <summary>
/// Copies an uploaded CV file into the owner's folder under the CV artifacts root, hashes the
/// stored copy with SHA-256, and builds the <c>CvUploadArtifact</c> record that describes it.
/// The record is only constructed here; the caller adds it to the database.
/// </summary>
/// <param name="user">Owner of the upload; <c>user.Id</c> names the per-user storage folder.</param>
/// <param name="file">The uploaded form file to store.</param>
/// <param name="cancellationToken">Cancels the copy and the hashing.</param>
/// <returns>A new, not yet persisted artifact record pointing at the stored file.</returns>
/// <remarks>
/// If copying or hashing fails (including cancellation), the partially written file is removed
/// on a best-effort basis before the original exception is rethrown, so no orphaned file is
/// left behind without a matching artifact row.
/// </remarks>
private async Task<CvUploadArtifact> SaveUploadArtifactAsync(ApplicationUser user, IFormFile file, CancellationToken cancellationToken)
{
    var extension = Path.GetExtension(file.FileName ?? string.Empty);
    var userRoot = Path.Combine(_paths.CvArtifactsRoot, user.Id);
    Directory.CreateDirectory(userRoot);

    // Timestamp + GUID keeps stored names unique and independent of the client-supplied name.
    var storedFileName = $"{DateTimeOffset.UtcNow:yyyyMMddHHmmss}-{Guid.NewGuid():N}{extension}";
    var storagePath = Path.Combine(userRoot, storedFileName);

    byte[] shaBytes;
    try
    {
        await using (var target = System.IO.File.Create(storagePath))
        await using (var source = file.OpenReadStream())
        {
            await source.CopyToAsync(target, cancellationToken);
        }

        // Hash the stored copy so the digest reflects exactly what is on disk.
        await using var hashStream = System.IO.File.OpenRead(storagePath);
        shaBytes = await SHA256.HashDataAsync(hashStream, cancellationToken);
    }
    catch
    {
        // All streams are disposed by the time we get here, so the file can be removed.
        try
        {
            System.IO.File.Delete(storagePath);
        }
        catch (Exception cleanupError) when (cleanupError is IOException or UnauthorizedAccessException)
        {
            // Best-effort cleanup: never mask the original failure with a cleanup failure.
        }

        throw;
    }

    return new CvUploadArtifact
    {
        OwnerUserId = user.Id,
        OriginalFileName = file.FileName ?? storedFileName,
        StoredFileName = storedFileName,
        MimeType = file.ContentType ?? "application/octet-stream",
        ByteSize = file.Length,
        Sha256 = Convert.ToHexString(shaBytes),
        StoragePath = storagePath,
        UploadedAtUtc = DateTimeOffset.UtcNow,
    };
}
/// <summary>
/// Runs the extraction pipeline over a CV file: obtains raw text (AI extraction first, then the
/// local <c>ExtractTextAsync</c> fallback when that yields nothing), passes it through
/// <c>MaybeReconstructStructuredCvAsync</c>, and builds the structured profile from the result.
/// </summary>
/// <param name="file">The CV file to read.</param>
/// <param name="extension">File extension including the leading dot; compared case-insensitively.</param>
/// <param name="cancellationToken">Flows to the AI extraction, reconstruction and structuring calls.</param>
/// <returns>The raw text, the normalized text, and the structured profile.</returns>
/// <exception cref="InvalidOperationException">No non-blank text could be extracted from the file.</exception>
private async Task<ExtractionPipelineResult> ExtractStructuredCvFromFileAsync(IFormFile file, string extension, CancellationToken cancellationToken)
{
    var aiReadableExtensions = new[] { ".pdf", ".docx", ".txt", ".md", ".png", ".jpg", ".jpeg", ".webp" };
    var aiExtractionAllowed = Array.Exists(
        aiReadableExtensions,
        candidate => string.Equals(extension, candidate, StringComparison.OrdinalIgnoreCase));

    var rawText = string.Empty;
    if (aiExtractionAllowed)
    {
        await using var uploadStream = file.OpenReadStream();
        var aiResult = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, cancellationToken);
        rawText = aiResult?.Text?.Trim() ?? string.Empty;
    }

    if (string.IsNullOrWhiteSpace(rawText))
    {
        // AI extraction was skipped or produced nothing: fall back to the local extractor.
        rawText = (await ExtractTextAsync(file, extension)).Trim();

        if (string.IsNullOrWhiteSpace(rawText))
        {
            throw new InvalidOperationException("The uploaded CV file could not be read or was empty.");
        }
    }

    var reconstructed = await MaybeReconstructStructuredCvAsync(rawText, cancellationToken);
    var normalizedText = reconstructed.Trim();
    var structuredCv = await BuildStructuredCvAsync(normalizedText, cancellationToken);

    return new ExtractionPipelineResult(rawText, normalizedText, structuredCv);
}
/// <summary>
/// Records a new <c>CvExtractionRun</c> for already-extracted text and applies its result to the
/// user's profile: CV text, structured CV JSON, current run id, profile version and, when an
/// artifact id is supplied, the current upload artifact.
/// </summary>
/// <param name="user">Profile owner; mutated in memory and saved through the user manager.</param>
/// <param name="trigger">Label stored on the run describing what started it (e.g. "reprocess", "rebuild").</param>
/// <param name="rawText">Text stored as the run's raw extracted text.</param>
/// <param name="normalizedText">Text stored on the run and written to the user's profile CV text.</param>
/// <param name="structuredCv">Structured profile; its metadata is stamped with the new version, run id and timestamp.</param>
/// <param name="artifactId">Upload artifact the run relates to, or <c>null</c> when there is none.</param>
/// <param name="cancellationToken">Flows to the database saves.</param>
/// <exception cref="InvalidOperationException">The user update was rejected; the run is saved as "failed" first.</exception>
/// <remarks>
/// The run is persisted as "running" and only flipped to "applied" (with completed/applied
/// timestamps) after the user update succeeds, matching the lifecycle the upload action uses.
/// A crash or rejected update therefore never leaves a run that claims to have been applied.
/// </remarks>
private async Task ApplyTextExtractionRunAsync(ApplicationUser user, string trigger, string rawText, string normalizedText, StructuredCvProfile structuredCv, int? artifactId, CancellationToken cancellationToken)
{
    var run = new CvExtractionRun
    {
        OwnerUserId = user.Id,
        ArtifactId = artifactId,
        Trigger = trigger,
        ParserVersion = ParserVersion,
        NormalizerVersion = NormalizerVersion,
        LlmPromptVersion = LlmPromptVersion,
        Status = "running",
        RawExtractedText = rawText,
        NormalizedText = normalizedText,
        StartedAtUtc = DateTimeOffset.UtcNow,
    };

    // Save first so the generated run.Id can be embedded in the profile metadata below.
    _db.CvExtractionRuns.Add(run);
    await _db.SaveChangesAsync(cancellationToken);

    structuredCv.Metadata.ProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
    structuredCv.Metadata.AppliedExtractionRunId = run.Id;
    structuredCv.Metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;

    var structuredJson = StructuredCvProfileJson.Serialize(structuredCv);
    run.StructuredProfileJson = structuredJson;

    user.ProfileCvText = normalizedText;
    user.ProfileCvStructureJson = structuredJson;
    user.CurrentCvExtractionRunId = run.Id;
    user.CurrentCvProfileVersion = structuredCv.Metadata.ProfileVersion;
    if (artifactId.HasValue)
    {
        user.CurrentCvUploadArtifactId = artifactId.Value;
    }

    var update = await _users.UpdateAsync(user);
    if (!update.Succeeded)
    {
        run.Status = "failed";
        run.ErrorMessage = string.Join("; ", update.Errors.Select(e => e.Description));
        run.CompletedAtUtc = DateTimeOffset.UtcNow;
        await _db.SaveChangesAsync(cancellationToken);
        throw new InvalidOperationException(run.ErrorMessage);
    }

    // Only now is the run actually applied to the profile.
    var appliedAtUtc = DateTimeOffset.UtcNow;
    run.Status = "applied";
    run.CompletedAtUtc = appliedAtUtc;
    run.AppliedAtUtc = appliedAtUtc;
    await _db.SaveChangesAsync(cancellationToken);
}
private async Task<StructuredCvProfile?> TryExtractStructuredCvAsync(string text, CancellationToken cancellationToken)
{
var structuredJson = await _aiService.SummarizeSectionAsync(