using BLAIzor.Models;
using System.Text;

namespace BLAIzor.Helpers
{
    /// <summary>
    /// Splits long text into size-bounded chunks along logical section boundaries and
    /// checks whether stored chunks still reassemble into the text they were cut from.
    /// </summary>
    public static class ChunkingHelper
    {
        // Placeholder that every delimiter is rewritten to before the text is split.
        private const string SectionBreakMarker = "[[SECTION_BREAK]]";

        // Blank line placed between logical parts that are packed into the same chunk.
        // Written explicitly (rather than via AppendLine) so the output stays "\n"-only
        // on every platform, matching the newline normalization applied to the input.
        private const string PartSeparator = "\n\n";

        // Strings that mark a logical boundary. The "\n"-prefixed entries only match at
        // the start of a line that is not the first line of the (trimmed) content.
        private static readonly string[] SectionDelimiters =
        {
            "\n##", "\n###", "\nSection:", "\nTopic:", "\nTitle:", "\n---",
            "[[CHUNK_BREAK]]", "### Logical Break ###"
        };

        /// <summary>
        /// Splits <paramref name="content"/> at the known section delimiters and packs the
        /// resulting parts into chunks of at most <paramref name="maxChunkCharLength"/> characters.
        /// </summary>
        /// <param name="content">Text to split. "\r\n" sequences are normalized to "\n".</param>
        /// <param name="maxChunkCharLength">Upper bound on the length of each chunk; must be positive.</param>
        /// <returns>
        /// The chunks in document order. Empty when <paramref name="content"/> is null or whitespace.
        /// The delimiter text itself (for example "##" or "Section:") is not part of any chunk.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="maxChunkCharLength"/> is zero or negative and there is content to split.
        /// </exception>
        public static List<string> SplitStructuredText(string content, int maxChunkCharLength = 3000)
        {
            // Kept ahead of the range check so blank input still yields an empty list
            // no matter which limit the caller passed.
            if (string.IsNullOrWhiteSpace(content))
            {
                return new List<string>();
            }

            // A zero limit would never advance the hard splitter, and a negative one would
            // only fail deep inside Substring; reject both up front.
            if (maxChunkCharLength <= 0)
            {
                throw new ArgumentOutOfRangeException(
                    nameof(maxChunkCharLength),
                    maxChunkCharLength,
                    "Chunk length must be greater than zero.");
            }

            // Normalize newlines so the "\n"-prefixed delimiters match Windows-style input too.
            content = content.Replace("\r\n", "\n").Trim();

            // Step 1: cut at the known logical boundaries.
            var logicalParts = SplitByDelimiters(content, SectionDelimiters);

            // Step 2: greedily pack consecutive parts into chunks that respect the limit.
            var chunks = new List<string>();
            var currentChunk = new StringBuilder();

            foreach (var part in logicalParts)
            {
                if (currentChunk.Length + part.Length + PartSeparator.Length <= maxChunkCharLength)
                {
                    currentChunk.Append(part).Append(PartSeparator);
                    continue;
                }

                // The part does not fit: close the chunk being built before placing it.
                FlushChunk(currentChunk, chunks);

                if (part.Length > maxChunkCharLength)
                {
                    // Too large for any single chunk; hard-split it by length.
                    chunks.AddRange(SplitLongParagraph(part, maxChunkCharLength));
                }
                else
                {
                    currentChunk.Append(part).Append(PartSeparator);
                }
            }

            FlushChunk(currentChunk, chunks);
            return chunks;
        }

        // Moves the pending text (if any) into the result list, trimmed, and resets the buffer.
        private static void FlushChunk(StringBuilder pending, List<string> chunks)
        {
            if (pending.Length == 0)
            {
                return;
            }

            chunks.Add(pending.ToString().Trim());
            pending.Clear();
        }

        // Splits content at every delimiter and returns the trimmed, non-empty pieces.
        // The delimiter text is consumed, not kept.
        private static List<string> SplitByDelimiters(string content, string[] delimiters)
        {
            var marked = content;

            // Longest delimiters first: "\n##" is a prefix of "\n###" and also overlaps
            // "### Logical Break ###" at the start of a line, so replacing it first would
            // leave stray "#" / "# Logical Break ###" text behind in the pieces.
            // OrderByDescending is a stable sort, so equal-length entries keep their order.
            foreach (var delimiter in delimiters.OrderByDescending(d => d.Length))
            {
                marked = marked.Replace(delimiter, "\n" + SectionBreakMarker);
            }

            return marked
                .Split(new[] { SectionBreakMarker }, StringSplitOptions.RemoveEmptyEntries)
                .Select(piece => piece.Trim())
                .Where(piece => piece.Length > 0)
                .ToList();
        }

        // Cuts text into consecutive pieces of at most maxLength characters.
        private static List<string> SplitLongParagraph(string text, int maxLength)
        {
            var pieces = new List<string>();
            int index = 0;

            while (index < text.Length)
            {
                int length = Math.Min(maxLength, text.Length - index);

                // Do not end a piece between the two halves of a surrogate pair. With
                // maxLength == 1 there is no room to back up, so the split is left as is
                // to guarantee forward progress.
                bool endsInsidePair = length > 1
                    && index + length < text.Length
                    && char.IsHighSurrogate(text[index + length - 1]);
                if (endsInsidePair)
                {
                    length--;
                }

                pieces.Add(text.Substring(index, length));
                index += length;
            }

            return pieces;
        }

        // NOTE(review): the element type of `chunks` is not visible in this file; it is
        // assumed to be ContentChunk (exposing ChunkIndex and QdrantPointId) - confirm
        // against BLAIzor.Models.
        /// <summary>
        /// Checks whether the Qdrant-based chunks match the original ContentItem text.
        /// </summary>
        /// <param name="item">The ContentItem to check.</param>
        /// <param name="chunks">The SQL chunks that reference Qdrant IDs.</param>
        /// <param name="vectorData">All related WebPageContent objects pulled from Qdrant.</param>
        /// <returns>
        /// <c>true</c> when the chunk contents, concatenated in ChunkIndex order, equal the item's
        /// content after both sides have spaces, "\r" and "\n" removed and are lower-cased;
        /// <c>false</c> otherwise, including when any argument is null or the item has no content.
        /// </returns>
        public static bool IsChunkingConsistent(
            ContentItem item,
            List<ContentChunk> chunks,
            List<WebPageContent> vectorData)
        {
            if (item == null || string.IsNullOrWhiteSpace(item.Content) || chunks == null || vectorData == null)
            {
                return false;
            }

            var orderedChunks = chunks.OrderBy(c => c.ChunkIndex).ToList();

            // A chunk whose point is missing from vectorData (or has null content)
            // contributes an empty string.
            var combinedText = string.Join("", orderedChunks.Select(chunk =>
                vectorData.FirstOrDefault(v => v.UId == chunk.QdrantPointId)?.Content?.Trim() ?? ""));

            var original = NormalizeText(item.Content);
            var reassembled = NormalizeText(combinedText);

            return original == reassembled;
        }

        // Reduces text to a form that ignores line breaks, spaces and letter case.
        private static string NormalizeText(string text)
        {
            return text.Replace("\r", "")
                .Replace("\n", "")
                .Replace(" ", "")
                .Trim()
                .ToLowerInvariant();
        }
    }
}