Add canonical CV artifact pipeline
This commit is contained in:
@@ -1,11 +1,14 @@
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using System.Text.RegularExpressions;
|
||||
using JobTrackerApi.Data;
|
||||
using JobTrackerApi.Services;
|
||||
using JobTrackerApi.Models;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.Identity;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
|
||||
namespace JobTrackerApi.Controllers;
|
||||
|
||||
@@ -52,19 +55,28 @@ public sealed class ProfileCvController : ControllerBase
|
||||
};
|
||||
|
||||
private const long MaxFileSizeBytes = 5 * 1024 * 1024;
|
||||
private const string ParserVersion = "m005-s01";
|
||||
private const string NormalizerVersion = "m005-s01";
|
||||
private const string LlmPromptVersion = "m005-s01";
|
||||
|
||||
private readonly UserManager<ApplicationUser> _users;
|
||||
private readonly ISummarizerService _aiService;
|
||||
private readonly JobTrackerContext _db;
|
||||
private readonly AppPaths _paths;
|
||||
|
||||
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService)
|
||||
public ProfileCvController(UserManager<ApplicationUser> users, ISummarizerService aiService, JobTrackerContext db, AppPaths paths)
|
||||
{
|
||||
_users = users;
|
||||
_aiService = aiService;
|
||||
_db = db;
|
||||
_paths = paths;
|
||||
}
|
||||
|
||||
public sealed record RewriteSectionRequest(string SectionName, string? Style, string? TargetRole);
|
||||
public sealed record ParseCvRequest(string? Text);
|
||||
|
||||
private sealed record ExtractionPipelineResult(string RawText, string NormalizedText, StructuredCvProfile StructuredCv);
|
||||
|
||||
[HttpPost("upload")]
|
||||
[RequestSizeLimit(MaxFileSizeBytes)]
|
||||
public async Task<IActionResult> Upload([FromForm] IFormFile file)
|
||||
@@ -80,48 +92,113 @@ public sealed class ProfileCvController : ControllerBase
|
||||
return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
|
||||
}
|
||||
|
||||
string text;
|
||||
var canUseAiExtraction = string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".md", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".png", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".jpg", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".jpeg", StringComparison.OrdinalIgnoreCase)
|
||||
|| string.Equals(extension, ".webp", StringComparison.OrdinalIgnoreCase);
|
||||
var artifact = await SaveUploadArtifactAsync(user, file, HttpContext.RequestAborted);
|
||||
_db.CvUploadArtifacts.Add(artifact);
|
||||
await _db.SaveChangesAsync(HttpContext.RequestAborted);
|
||||
|
||||
if (canUseAiExtraction)
|
||||
var run = new CvExtractionRun
|
||||
{
|
||||
await using var uploadStream = file.OpenReadStream();
|
||||
var extracted = await _aiService.ExtractTextAsync(uploadStream, file.FileName ?? $"cv{extension}", file.ContentType, HttpContext.RequestAborted);
|
||||
text = extracted?.Text?.Trim() ?? string.Empty;
|
||||
OwnerUserId = user.Id,
|
||||
ArtifactId = artifact.Id,
|
||||
Trigger = "upload",
|
||||
ParserVersion = ParserVersion,
|
||||
NormalizerVersion = NormalizerVersion,
|
||||
LlmPromptVersion = LlmPromptVersion,
|
||||
Status = "running",
|
||||
StartedAtUtc = DateTimeOffset.UtcNow,
|
||||
};
|
||||
_db.CvExtractionRuns.Add(run);
|
||||
await _db.SaveChangesAsync(HttpContext.RequestAborted);
|
||||
|
||||
try
|
||||
{
|
||||
var result = await ExtractStructuredCvFromFileAsync(file, extension, HttpContext.RequestAborted);
|
||||
result.StructuredCv.Metadata.ProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
|
||||
result.StructuredCv.Metadata.AppliedExtractionRunId = run.Id;
|
||||
result.StructuredCv.Metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;
|
||||
var structuredJson = StructuredCvProfileJson.Serialize(result.StructuredCv);
|
||||
|
||||
run.RawExtractedText = result.RawText;
|
||||
run.NormalizedText = result.NormalizedText;
|
||||
run.StructuredProfileJson = structuredJson;
|
||||
run.Status = "applied";
|
||||
run.CompletedAtUtc = DateTimeOffset.UtcNow;
|
||||
run.AppliedAtUtc = run.CompletedAtUtc;
|
||||
|
||||
user.ProfileCvText = result.NormalizedText;
|
||||
user.ProfileCvStructureJson = structuredJson;
|
||||
user.CurrentCvUploadArtifactId = artifact.Id;
|
||||
user.CurrentCvExtractionRunId = run.Id;
|
||||
user.CurrentCvProfileVersion = result.StructuredCv.Metadata.ProfileVersion;
|
||||
|
||||
var update = await _users.UpdateAsync(user);
|
||||
if (!update.Succeeded)
|
||||
{
|
||||
run.Status = "failed";
|
||||
run.ErrorMessage = string.Join("; ", update.Errors.Select(e => e.Description));
|
||||
await _db.SaveChangesAsync(HttpContext.RequestAborted);
|
||||
return BadRequest(run.ErrorMessage);
|
||||
}
|
||||
|
||||
await _db.SaveChangesAsync(HttpContext.RequestAborted);
|
||||
|
||||
return Ok(new
|
||||
{
|
||||
imported = true,
|
||||
characters = result.NormalizedText.Length,
|
||||
structuredCv = result.StructuredCv,
|
||||
sections = result.StructuredCv.Sections,
|
||||
artifactId = artifact.Id,
|
||||
extractionRunId = run.Id,
|
||||
profileVersion = result.StructuredCv.Metadata.ProfileVersion,
|
||||
});
|
||||
}
|
||||
else
|
||||
catch (Exception ex)
|
||||
{
|
||||
text = string.Empty;
|
||||
run.Status = "failed";
|
||||
run.ErrorMessage = ex.Message;
|
||||
run.CompletedAtUtc = DateTimeOffset.UtcNow;
|
||||
await _db.SaveChangesAsync(HttpContext.RequestAborted);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
[HttpPost("reprocess")]
|
||||
public async Task<IActionResult> Reprocess()
|
||||
{
|
||||
var user = await _users.GetUserAsync(User);
|
||||
if (user is null) return Unauthorized();
|
||||
|
||||
var artifact = await _db.CvUploadArtifacts
|
||||
.OrderByDescending(x => x.UploadedAtUtc)
|
||||
.FirstOrDefaultAsync(x => x.OwnerUserId == user.Id, HttpContext.RequestAborted);
|
||||
|
||||
if (artifact is null) return BadRequest("Upload a CV before reprocessing it.");
|
||||
if (string.IsNullOrWhiteSpace(artifact.StoragePath) || !System.IO.File.Exists(artifact.StoragePath))
|
||||
{
|
||||
return BadRequest("The stored CV artifact could not be found for reprocessing.");
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(text))
|
||||
await using var stream = System.IO.File.OpenRead(artifact.StoragePath);
|
||||
var file = new FormFile(stream, 0, stream.Length, "file", artifact.OriginalFileName)
|
||||
{
|
||||
text = (await ExtractTextAsync(file, extension)).Trim();
|
||||
}
|
||||
if (string.IsNullOrWhiteSpace(text))
|
||||
Headers = new HeaderDictionary(),
|
||||
ContentType = artifact.MimeType
|
||||
};
|
||||
|
||||
var extension = Path.GetExtension(artifact.OriginalFileName ?? string.Empty);
|
||||
var result = await ExtractStructuredCvFromFileAsync(file, extension, HttpContext.RequestAborted);
|
||||
await ApplyTextExtractionRunAsync(user, "reprocess", result.RawText, result.NormalizedText, result.StructuredCv, artifact.Id, HttpContext.RequestAborted);
|
||||
|
||||
return Ok(new
|
||||
{
|
||||
return BadRequest("The uploaded CV file could not be read or was empty.");
|
||||
}
|
||||
|
||||
text = (await MaybeReconstructStructuredCvAsync(text, HttpContext.RequestAborted)).Trim();
|
||||
var structuredCv = await BuildStructuredCvAsync(text, HttpContext.RequestAborted);
|
||||
|
||||
user.ProfileCvText = text;
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var result = await _users.UpdateAsync(user);
|
||||
if (!result.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
|
||||
}
|
||||
|
||||
return Ok(new { imported = true, characters = text.Length, structuredCv, sections = structuredCv.Sections });
|
||||
reprocessed = true,
|
||||
artifactId = artifact.Id,
|
||||
extractionRunId = user.CurrentCvExtractionRunId,
|
||||
profileVersion = user.CurrentCvProfileVersion,
|
||||
structuredCv = result.StructuredCv,
|
||||
sections = result.StructuredCv.Sections,
|
||||
});
|
||||
}
|
||||
|
||||
[HttpPost("rebuild")]
|
||||
@@ -144,14 +221,9 @@ public sealed class ProfileCvController : ControllerBase
|
||||
|
||||
user.ProfileCvText = rebuilt.Trim();
|
||||
var structuredCv = await BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted);
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var result = await _users.UpdateAsync(user);
|
||||
if (!result.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
|
||||
}
|
||||
await ApplyTextExtractionRunAsync(user, "rebuild", user.ProfileCvText, user.ProfileCvText, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);
|
||||
|
||||
return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections });
|
||||
return Ok(new { rebuilt = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections, extractionRunId = user.CurrentCvExtractionRunId, profileVersion = user.CurrentCvProfileVersion });
|
||||
}
|
||||
|
||||
[HttpPost("rewrite-section")]
|
||||
@@ -189,14 +261,13 @@ public sealed class ProfileCvController : ControllerBase
|
||||
if (string.IsNullOrWhiteSpace(source)) return BadRequest("Add or import CV text before parsing sections.");
|
||||
|
||||
var structuredCv = await BuildStructuredCvAsync(source, HttpContext.RequestAborted);
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var update = await _users.UpdateAsync(user);
|
||||
if (!update.Succeeded)
|
||||
if (string.IsNullOrWhiteSpace(request?.Text))
|
||||
{
|
||||
return BadRequest(string.Join("; ", update.Errors.Select(e => e.Description)));
|
||||
user.ProfileCvText = source;
|
||||
}
|
||||
await ApplyTextExtractionRunAsync(user, "parse", source, source, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);
|
||||
|
||||
return Ok(new { structuredCv, sections = structuredCv.Sections, totalWords = CountWords(source) });
|
||||
return Ok(new { structuredCv, sections = structuredCv.Sections, totalWords = CountWords(source), extractionRunId = user.CurrentCvExtractionRunId, profileVersion = user.CurrentCvProfileVersion });
|
||||
}
|
||||
|
||||
[HttpPost("improve")]
|
||||
@@ -219,14 +290,9 @@ public sealed class ProfileCvController : ControllerBase
|
||||
|
||||
user.ProfileCvText = improved.Trim();
|
||||
var structuredCv = await BuildStructuredCvAsync(user.ProfileCvText, HttpContext.RequestAborted);
|
||||
user.ProfileCvStructureJson = StructuredCvProfileJson.Serialize(structuredCv);
|
||||
var result = await _users.UpdateAsync(user);
|
||||
if (!result.Succeeded)
|
||||
{
|
||||
return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
|
||||
}
|
||||
await ApplyTextExtractionRunAsync(user, "improve", user.ProfileCvText, user.ProfileCvText, structuredCv, user.CurrentCvUploadArtifactId, HttpContext.RequestAborted);
|
||||
|
||||
return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections });
|
||||
return Ok(new { improved = true, characters = user.ProfileCvText.Length, text = user.ProfileCvText, structuredCv, sections = structuredCv.Sections, extractionRunId = user.CurrentCvExtractionRunId, profileVersion = user.CurrentCvProfileVersion });
|
||||
}
|
||||
|
||||
private async Task<StructuredCvProfile> BuildStructuredCvAsync(string text, CancellationToken cancellationToken)
|
||||
@@ -252,6 +318,121 @@ public sealed class ProfileCvController : ControllerBase
|
||||
return StructuredCvProfileJson.Normalize(merged);
|
||||
}
|
||||
|
||||
/// <summary>
/// Copies the uploaded file into the user's directory under <c>_paths.CvArtifactsRoot</c>
/// and returns an unsaved <see cref="CvUploadArtifact"/> describing it (names, MIME type,
/// byte size, SHA-256 and storage path). The caller is responsible for adding the
/// returned entity to the database context.
/// </summary>
/// <param name="user">Owner of the artifact; <c>user.Id</c> names the per-user directory.</param>
/// <param name="file">The uploaded form file to persist.</param>
/// <param name="cancellationToken">Cancels the copy and hashing I/O.</param>
/// <returns>A populated, not yet persisted, artifact record.</returns>
/// <remarks>
/// If copying or hashing fails (including cancellation) the partially written file is
/// removed so no orphaned artifact is left on disk, and the original exception is rethrown.
/// </remarks>
private async Task<CvUploadArtifact> SaveUploadArtifactAsync(ApplicationUser user, IFormFile file, CancellationToken cancellationToken)
{
    // The client-supplied file name is untrusted input: keep only its final path segment
    // so directory components never end up in the stored metadata.
    var clientFileName = Path.GetFileName(file.FileName ?? string.Empty);
    var extension = Path.GetExtension(clientFileName);

    var userRoot = Path.Combine(_paths.CvArtifactsRoot, user.Id);
    Directory.CreateDirectory(userRoot);

    // One timestamp for both the stored name and the record keeps them consistent.
    var uploadedAtUtc = DateTimeOffset.UtcNow;
    var storedFileName = $"{uploadedAtUtc:yyyyMMddHHmmss}-{Guid.NewGuid():N}{extension}";
    var storagePath = Path.Combine(userRoot, storedFileName);

    string sha256Hex;
    try
    {
        await using (var target = System.IO.File.Create(storagePath))
        await using (var source = file.OpenReadStream())
        {
            await source.CopyToAsync(target, cancellationToken);
        }

        // Hash the bytes that actually landed on disk.
        await using var hashStream = System.IO.File.OpenRead(storagePath);
        var shaBytes = await SHA256.HashDataAsync(hashStream, cancellationToken);
        sha256Hex = Convert.ToHexString(shaBytes);
    }
    catch
    {
        // Best-effort cleanup: the streams above are already disposed at this point.
        // A cleanup failure must not mask the original error.
        try
        {
            System.IO.File.Delete(storagePath);
        }
        catch (IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }

        throw;
    }

    return new CvUploadArtifact
    {
        OwnerUserId = user.Id,
        OriginalFileName = string.IsNullOrWhiteSpace(clientFileName) ? storedFileName : clientFileName,
        StoredFileName = storedFileName,
        MimeType = file.ContentType ?? "application/octet-stream",
        ByteSize = file.Length,
        Sha256 = sha256Hex,
        StoragePath = storagePath,
        UploadedAtUtc = uploadedAtUtc,
    };
}
|
||||
|
||||
/// <summary>
/// Runs the text pipeline for a CV file: extract raw text, reconstruct/normalize it,
/// then build the structured profile from the normalized text.
/// </summary>
/// <param name="file">The CV file to read.</param>
/// <param name="extension">File extension including the leading dot; compared case-insensitively.</param>
/// <param name="cancellationToken">Forwarded to the AI service and the downstream pipeline steps.</param>
/// <returns>The raw extracted text, the normalized text and the structured CV.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when neither extraction path yields any non-whitespace text.
/// </exception>
private async Task<ExtractionPipelineResult> ExtractStructuredCvFromFileAsync(IFormFile file, string extension, CancellationToken cancellationToken)
{
    // Extensions for which the AI service is tried first.
    string[] aiExtractableExtensions = { ".pdf", ".docx", ".txt", ".md", ".png", ".jpg", ".jpeg", ".webp" };
    var aiSupported = Array.Exists(
        aiExtractableExtensions,
        candidate => string.Equals(extension, candidate, StringComparison.OrdinalIgnoreCase));

    var rawText = string.Empty;
    if (aiSupported)
    {
        await using var aiInput = file.OpenReadStream();
        var aiResult = await _aiService.ExtractTextAsync(aiInput, file.FileName ?? $"cv{extension}", file.ContentType, cancellationToken);
        rawText = aiResult?.Text?.Trim() ?? string.Empty;
    }

    if (string.IsNullOrWhiteSpace(rawText))
    {
        // The AI path produced nothing (or was skipped): fall back to the controller's own extractor.
        rawText = (await ExtractTextAsync(file, extension)).Trim();

        if (string.IsNullOrWhiteSpace(rawText))
        {
            throw new InvalidOperationException("The uploaded CV file could not be read or was empty.");
        }
    }

    var reconstructed = await MaybeReconstructStructuredCvAsync(rawText, cancellationToken);
    var normalizedText = reconstructed.Trim();
    var structuredCv = await BuildStructuredCvAsync(normalizedText, cancellationToken);

    return new ExtractionPipelineResult(rawText, normalizedText, structuredCv);
}
|
||||
|
||||
/// <summary>
/// Records a <see cref="CvExtractionRun"/> for text that has already been extracted and
/// applies the result to the user's profile (CV text, structured JSON, current run id,
/// profile version and, when supplied, current artifact id).
/// </summary>
/// <param name="user">The user whose profile is updated; its CV fields are mutated in place.</param>
/// <param name="trigger">Label stored on the run identifying what initiated it.</param>
/// <param name="rawText">Text stored as the run's raw extracted text.</param>
/// <param name="normalizedText">Text stored on the run and written to the user's profile.</param>
/// <param name="structuredCv">Structured profile; its metadata is stamped with the new version and run id.</param>
/// <param name="artifactId">Upload artifact to link, or <c>null</c> to leave the user's current artifact unchanged.</param>
/// <param name="cancellationToken">Forwarded to the database saves.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when the user update is rejected; the run is saved as "failed" with the
/// joined error descriptions first.
/// </exception>
/// <remarks>
/// The run is inserted as "running" and only marked "applied" (with its applied/completed
/// timestamps) after the user update succeeds, so a run row never claims to be applied
/// when the profile was not actually changed.
/// </remarks>
private async Task ApplyTextExtractionRunAsync(ApplicationUser user, string trigger, string rawText, string normalizedText, StructuredCvProfile structuredCv, int? artifactId, CancellationToken cancellationToken)
{
    var run = new CvExtractionRun
    {
        OwnerUserId = user.Id,
        ArtifactId = artifactId,
        Trigger = trigger,
        ParserVersion = ParserVersion,
        NormalizerVersion = NormalizerVersion,
        LlmPromptVersion = LlmPromptVersion,
        Status = "running",
        RawExtractedText = rawText,
        NormalizedText = normalizedText,
        StartedAtUtc = DateTimeOffset.UtcNow,
    };
    _db.CvExtractionRuns.Add(run);

    // Saved up front because run.Id is embedded in the structured metadata below
    // (presumably a database-generated key — confirm against the entity mapping).
    await _db.SaveChangesAsync(cancellationToken);

    structuredCv.Metadata.ProfileVersion = (user.CurrentCvProfileVersion ?? 0) + 1;
    structuredCv.Metadata.AppliedExtractionRunId = run.Id;
    structuredCv.Metadata.UpdatedAtUtc = DateTimeOffset.UtcNow;
    var structuredJson = StructuredCvProfileJson.Serialize(structuredCv);
    run.StructuredProfileJson = structuredJson;

    user.ProfileCvText = normalizedText;
    user.ProfileCvStructureJson = structuredJson;
    user.CurrentCvExtractionRunId = run.Id;
    user.CurrentCvProfileVersion = structuredCv.Metadata.ProfileVersion;
    if (artifactId.HasValue)
    {
        user.CurrentCvUploadArtifactId = artifactId.Value;
    }

    var update = await _users.UpdateAsync(user);
    if (!update.Succeeded)
    {
        // The profile was not changed: close the run as failed and leave AppliedAtUtc unset.
        run.Status = "failed";
        run.ErrorMessage = string.Join("; ", update.Errors.Select(e => e.Description));
        run.CompletedAtUtc = DateTimeOffset.UtcNow;
        await _db.SaveChangesAsync(cancellationToken);
        throw new InvalidOperationException(run.ErrorMessage);
    }

    // Only now is the run truly applied; use one instant for both timestamps.
    var appliedAtUtc = DateTimeOffset.UtcNow;
    run.Status = "applied";
    run.CompletedAtUtc = appliedAtUtc;
    run.AppliedAtUtc = appliedAtUtc;
    await _db.SaveChangesAsync(cancellationToken);
}
|
||||
|
||||
private async Task<StructuredCvProfile?> TryExtractStructuredCvAsync(string text, CancellationToken cancellationToken)
|
||||
{
|
||||
var structuredJson = await _aiService.SummarizeSectionAsync(
|
||||
|
||||
Reference in New Issue
Block a user