using System.Text;
using System.Text.RegularExpressions;
using JobTrackerApi.Services;
using JobTrackerApi.Models;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Identity;
using Microsoft.AspNetCore.Mvc;

namespace JobTrackerApi.Controllers;

/// <summary>
/// Imports a CV file uploaded by the signed-in user and stores its text on the
/// user's <c>ProfileCvText</c> property.
/// </summary>
/// <remarks>
/// Text is requested from the injected summarizer service first. When that yields
/// nothing, a local best-effort extractor is tried for .txt, .md, .pdf and .docx.
/// </remarks>
[ApiController]
[Route("api/profile-cv")]
[Authorize(AuthenticationSchemes = "local")]
public sealed class ProfileCvController : ControllerBase
{
    /// <summary>File extensions accepted by <see cref="Upload"/> (compared case-insensitively).</summary>
    private static readonly HashSet<string> AllowedExtensions = new(StringComparer.OrdinalIgnoreCase)
    {
        ".txt",
        ".md",
        ".pdf",
        ".docx",
        ".png",
        ".jpg",
        ".jpeg",
        ".webp",
    };

    /// <summary>Upper bound on the uploaded file size (5 MB).</summary>
    private const long MaxFileSizeBytes = 5 * 1024 * 1024;

    // The patterns below run over untrusted uploads of up to 5 MB. The lazy ".*?" and
    // "[^>]+" forms can degrade badly on adversarial (or merely binary) input, so every
    // match attempt is bounded; a timeout is treated as "nothing could be extracted".
    // NOTE: this field must stay above the Regex fields - static initializers run in textual order.
    private static readonly TimeSpan RegexTimeout = TimeSpan.FromSeconds(2);

    /// <summary>Literal string followed by the PDF <c>Tj</c> operator.</summary>
    private static readonly Regex PdfShowStringRegex = new(@"\((.*?)\)Tj", RegexOptions.Singleline, RegexTimeout);

    /// <summary>Array operand followed by the PDF <c>TJ</c> operator.</summary>
    private static readonly Regex PdfShowArrayRegex = new(@"\[(.*?)\]TJ", RegexOptions.Singleline, RegexTimeout);

    /// <summary>A parenthesised literal string inside a <c>TJ</c> array.</summary>
    private static readonly Regex PdfLiteralStringRegex = new(@"\((.*?)\)", RegexOptions.Singleline, RegexTimeout);

    /// <summary>Control characters other than tab, line feed and carriage return.</summary>
    private static readonly Regex ControlCharRegex = new(@"[\x00-\x08\x0B\x0C\x0E-\x1F]", RegexOptions.None, RegexTimeout);

    /// <summary>Any run of whitespace.</summary>
    private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.None, RegexTimeout);

    /// <summary>Any markup tag.</summary>
    private static readonly Regex MarkupTagRegex = new("<[^>]+>", RegexOptions.None, RegexTimeout);

    private readonly UserManager _users;
    private readonly ISummarizerService _aiService;

    /// <summary>Creates the controller with its user store and text-extraction service.</summary>
    /// <param name="users">Used to resolve the current user and persist the imported text.</param>
    /// <param name="aiService">Service asked first to extract text from the upload.</param>
    public ProfileCvController(UserManager users, ISummarizerService aiService)
    {
        _users = users;
        _aiService = aiService;
    }

    /// <summary>
    /// Accepts a CV upload, extracts its text and saves it on the current user.
    /// </summary>
    /// <param name="file">The uploaded form file.</param>
    /// <returns>
    /// 401 when the current user cannot be resolved; 400 when the file is missing, empty,
    /// too large, of an unsupported extension, yields no text, or the user update fails;
    /// otherwise 200 with <c>{ imported, characters }</c>.
    /// </returns>
    [HttpPost("upload")]
    [RequestSizeLimit(MaxFileSizeBytes)]
    public async Task<IActionResult> Upload([FromForm] IFormFile file)
    {
        var user = await _users.GetUserAsync(User);
        if (user is null)
        {
            return Unauthorized();
        }

        if (file is null || file.Length == 0)
        {
            return BadRequest("Select a CV file to upload.");
        }

        if (file.Length > MaxFileSizeBytes)
        {
            return BadRequest("CV import file is too large. Keep it under 5 MB.");
        }

        var extension = Path.GetExtension(file.FileName ?? string.Empty);
        if (!AllowedExtensions.Contains(extension))
        {
            return BadRequest("Only .txt, .md, .pdf, .docx, .png, .jpg, .jpeg, and .webp CV imports are supported right now.");
        }

        var cancellationToken = HttpContext.RequestAborted;

        // Every allow-listed extension is sent to the extraction service first; the local
        // extractor below is only a fallback for when the service returns no text.
        string text;
        await using (var uploadStream = file.OpenReadStream())
        {
            var extracted = await _aiService.ExtractTextAsync(
                uploadStream,
                file.FileName ?? $"cv{extension}",
                file.ContentType,
                cancellationToken);
            text = extracted?.Text?.Trim() ?? string.Empty;
        }

        if (string.IsNullOrWhiteSpace(text))
        {
            text = (await ExtractTextAsync(file, extension, cancellationToken)).Trim();
        }

        if (string.IsNullOrWhiteSpace(text))
        {
            return BadRequest("The uploaded CV file could not be read or was empty.");
        }

        user.ProfileCvText = text;
        var result = await _users.UpdateAsync(user);
        if (!result.Succeeded)
        {
            return BadRequest(string.Join("; ", result.Errors.Select(e => e.Description)));
        }

        return Ok(new { imported = true, characters = text.Length });
    }

    /// <summary>
    /// Local best-effort text extraction, used when the extraction service returned nothing.
    /// </summary>
    /// <param name="file">The uploaded form file.</param>
    /// <param name="extension">The file extension including the leading dot.</param>
    /// <param name="cancellationToken">Cancels buffering of the upload.</param>
    /// <returns>
    /// The extracted text, or an empty string when the format has no local extractor
    /// (the image types) or nothing readable was found.
    /// </returns>
    private static async Task<string> ExtractTextAsync(
        IFormFile file,
        string extension,
        CancellationToken cancellationToken = default)
    {
        if (HasExtension(extension, ".txt") || HasExtension(extension, ".md"))
        {
            using var stream = file.OpenReadStream();
            using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true);
            return (await reader.ReadToEndAsync()).Trim();
        }

        var isPdf = HasExtension(extension, ".pdf");
        var isDocx = HasExtension(extension, ".docx");
        if (!isPdf && !isDocx)
        {
            // No local extractor for this format; skip buffering the upload entirely.
            return string.Empty;
        }

        await using var memory = new MemoryStream();
        await file.CopyToAsync(memory, cancellationToken);
        var bytes = memory.ToArray();

        return isPdf ? ExtractPdfText(bytes) : await ExtractDocxTextAsync(bytes);
    }

    /// <summary>Case-insensitive extension comparison.</summary>
    private static bool HasExtension(string extension, string expected) =>
        string.Equals(extension, expected, StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// Collects the literal strings that precede <c>Tj</c>/<c>TJ</c> operators in the raw
    /// PDF bytes. Nothing is decompressed or decoded beyond that, so many PDFs yield nothing.
    /// </summary>
    /// <returns>The joined, whitespace-normalised strings, or empty when none were found.</returns>
    private static string ExtractPdfText(byte[] bytes)
    {
        var raw = Encoding.UTF8.GetString(bytes);

        try
        {
            var shownStrings = PdfShowStringRegex.Matches(raw)
                .Select(match => match.Groups[1].Value);
            var shownArrayStrings = PdfShowArrayRegex.Matches(raw)
                .SelectMany(match => PdfLiteralStringRegex.Matches(match.Groups[1].Value)
                    .Select(inner => inner.Groups[1].Value));

            var fragments = shownStrings
                .Concat(shownArrayStrings)
                .Where(value => !string.IsNullOrWhiteSpace(value))
                .Select(UnescapeLeniently)
                .ToList();

            if (fragments.Count == 0)
            {
                // Do NOT fall back to the raw file content: that would persist binary PDF
                // data as the user's CV text and report a successful import. Returning
                // empty lets the caller answer "could not be read" instead.
                return string.Empty;
            }

            var joined = string.Join(" ", fragments);
            var scrubbed = ControlCharRegex.Replace(joined, " ");
            return WhitespaceRegex.Replace(scrubbed, " ").Trim();
        }
        catch (RegexMatchTimeoutException)
        {
            return string.Empty;
        }
    }

    /// <summary>
    /// Resolves backslash escapes in an extracted fragment, keeping the fragment unchanged
    /// when it contains a sequence <see cref="Regex.Unescape(string)"/> rejects (for example
    /// a trailing backslash left behind when a string contained an escaped parenthesis).
    /// </summary>
    private static string UnescapeLeniently(string value)
    {
        try
        {
            return Regex.Unescape(value);
        }
        catch (ArgumentException)
        {
            return value;
        }
    }

    /// <summary>
    /// Reads <c>word/document.xml</c> from the DOCX archive, strips all markup tags and
    /// decodes character entities.
    /// </summary>
    /// <returns>
    /// The whitespace-normalised text, or empty when the bytes are not a readable ZIP
    /// archive or the document part is missing.
    /// </returns>
    private static async Task<string> ExtractDocxTextAsync(byte[] bytes)
    {
        try
        {
            using var archive = new System.IO.Compression.ZipArchive(
                new MemoryStream(bytes),
                System.IO.Compression.ZipArchiveMode.Read,
                leaveOpen: false);

            var entry = archive.GetEntry("word/document.xml");
            if (entry is null)
            {
                return string.Empty;
            }

            using var entryStream = entry.Open();
            using var reader = new StreamReader(entryStream, Encoding.UTF8);
            var xml = await reader.ReadToEndAsync();

            var withoutTags = MarkupTagRegex.Replace(xml, " ");
            var decoded = System.Net.WebUtility.HtmlDecode(withoutTags) ?? string.Empty;
            return WhitespaceRegex.Replace(decoded, " ").Trim();
        }
        catch (InvalidDataException)
        {
            // A ".docx" that is not a valid ZIP archive (or has a corrupt entry) is an
            // unreadable upload, not a server error.
            return string.Empty;
        }
        catch (RegexMatchTimeoutException)
        {
            return string.Empty;
        }
    }
}