// SeemGen/Helpers/ChunkingHelper.cs

using BLAIzor.Models;
using System.Text;

namespace BLAIzor.Helpers
{
    public static class ChunkingHelper
    {
        /// <summary>
        /// Markers that denote a logical section boundary inside structured text.
        /// </summary>
        /// <remarks>
        /// Listed so that a longer marker always precedes any shorter marker it contains
        /// (for example "\n###" before "\n##"). The markers are replaced one after another,
        /// so a shorter marker placed first would match inside the longer one and leave
        /// stray characters such as "#" or "# Logical Break ###" in the resulting section.
        /// </remarks>
        private static readonly string[] SectionDelimiters =
        {
            "### Logical Break ###",
            "[[CHUNK_BREAK]]",
            "\nSection:",
            "\nTopic:",
            "\nTitle:",
            "\n###",
            "\n---",
            "\n##",
        };

        /// <summary>
        /// Splits structured text into chunks of at most <paramref name="maxChunkCharLength"/> characters.
        /// </summary>
        /// <remarks>
        /// The text is first cut at the markers in <see cref="SectionDelimiters"/>. Consecutive
        /// sections are then packed into one chunk, separated by a blank line, for as long as
        /// they fit. A single section longer than the limit is emitted as slices of its own.
        /// </remarks>
        /// <param name="content">The text to split. Windows line endings are normalized to "\n".</param>
        /// <param name="maxChunkCharLength">Upper bound, in characters, for every returned chunk.</param>
        /// <returns>The chunks in document order; an empty list when <paramref name="content"/> is null or blank.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="maxChunkCharLength"/> is zero or negative and <paramref name="content"/> is not blank.
        /// </exception>
        public static List<string> SplitStructuredText(string content, int maxChunkCharLength = 3000)
        {
            if (string.IsNullOrWhiteSpace(content))
            {
                return new List<string>();
            }

            // A non-positive limit cannot be satisfied: zero would never consume any text when
            // slicing an oversized section, and a negative value is not a valid length.
            if (maxChunkCharLength <= 0)
            {
                throw new ArgumentOutOfRangeException(
                    nameof(maxChunkCharLength),
                    maxChunkCharLength,
                    "The maximum chunk length must be greater than zero.");
            }

            // Explicit "\n" separator: the input is normalized to "\n" below, so the output must
            // not re-introduce the platform newline, and the size check must count exactly the
            // characters that get appended.
            const string PartSeparator = "\n\n";

            // Normalize newlines
            content = content.Replace("\r\n", "\n").Trim();

            // Step 1: split by known logical boundaries
            var logicalParts = SplitByDelimiters(content, SectionDelimiters);

            // Step 2: recombine smaller logical parts into full-size chunks
            var chunks = new List<string>();
            var currentChunk = new StringBuilder();

            foreach (var part in logicalParts)
            {
                bool fitsInCurrent =
                    currentChunk.Length + part.Length + PartSeparator.Length <= maxChunkCharLength;

                if (!fitsInCurrent)
                {
                    // Close the chunk being built before dealing with the part that overflowed it.
                    if (currentChunk.Length > 0)
                    {
                        chunks.Add(currentChunk.ToString().Trim());
                        currentChunk.Clear();
                    }

                    if (part.Length > maxChunkCharLength)
                    {
                        // Too large for any single chunk: slice it and emit the slices directly.
                        chunks.AddRange(SplitLongParagraph(part, maxChunkCharLength));
                        continue;
                    }
                }

                currentChunk.Append(part).Append(PartSeparator);
            }

            if (currentChunk.Length > 0)
            {
                chunks.Add(currentChunk.ToString().Trim());
            }

            return chunks;
        }

        /// <summary>
        /// Cuts <paramref name="content"/> at every occurrence of any of the given delimiters.
        /// </summary>
        /// <remarks>
        /// The delimiter text itself is not kept in the output. Delimiters are applied
        /// longest first so that a short delimiter (such as "\n##") cannot match inside a
        /// longer one (such as "\n###") and leave part of it behind in a section.
        /// </remarks>
        /// <param name="content">The text to cut.</param>
        /// <param name="delimiters">Literal delimiter strings; null or empty entries are ignored.</param>
        /// <returns>The trimmed, non-blank sections in their original order.</returns>
        private static List<string> SplitByDelimiters(string content, string[] delimiters)
        {
            const string SectionBreak = "[[SECTION_BREAK]]";

            // Empty delimiters are skipped because string.Replace rejects an empty search value.
            var longestFirst = delimiters
                .Where(d => !string.IsNullOrEmpty(d))
                .OrderByDescending(d => d.Length);

            // Rewrite every delimiter to one common sentinel, then split on that sentinel.
            var marked = content;
            foreach (var delimiter in longestFirst)
            {
                marked = marked.Replace(delimiter, "\n" + SectionBreak);
            }

            return marked.Split(new[] { SectionBreak }, StringSplitOptions.RemoveEmptyEntries)
                         .Select(s => s.Trim())
                         .Where(s => !string.IsNullOrWhiteSpace(s))
                         .ToList();
        }

        /// <summary>
        /// Cuts <paramref name="text"/> into consecutive slices of at most
        /// <paramref name="maxLength"/> characters.
        /// </summary>
        /// <remarks>
        /// A slice is ended one character early when it would otherwise separate the two
        /// halves of a UTF-16 surrogate pair, so no slice ends in a dangling high surrogate
        /// (unless <paramref name="maxLength"/> is 1, where that cannot be avoided).
        /// </remarks>
        /// <param name="text">The text to slice.</param>
        /// <param name="maxLength">Maximum slice length in characters; must be positive.</param>
        /// <returns>The slices in order; concatenating them yields <paramref name="text"/>.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxLength"/> is zero or negative.</exception>
        private static List<string> SplitLongParagraph(string text, int maxLength)
        {
            // With a zero length the loop below would never advance.
            if (maxLength <= 0)
            {
                throw new ArgumentOutOfRangeException(
                    nameof(maxLength),
                    maxLength,
                    "The maximum slice length must be greater than zero.");
            }

            var chunks = new List<string>();
            int index = 0;
            while (index < text.Length)
            {
                int len = Math.Min(maxLength, text.Length - index);
                int end = index + len;

                // Keep a surrogate pair together by moving it entirely into the next slice.
                if (len > 1
                    && end < text.Length
                    && char.IsHighSurrogate(text[end - 1])
                    && char.IsLowSurrogate(text[end]))
                {
                    len--;
                }

                chunks.Add(text.Substring(index, len));
                index += len;
            }
            return chunks;
        }

        /// <summary>
        /// Reports whether the chunk texts, put back together in chunk order, reproduce the
        /// text of the given content item.
        /// </summary>
        /// <remarks>
        /// Both sides are compared after <see cref="NormalizeText"/>, so differences in
        /// letter case and in the characters that method strips do not count. A chunk with no
        /// matching entry in <paramref name="vectorData"/> contributes no text.
        /// NOTE(review): if the stored chunk text had section markers removed when it was
        /// produced, content containing such markers will not match; confirm how the
        /// chunks are generated.
        /// </remarks>
        /// <param name="item">The content item whose <c>Content</c> is the reference text.</param>
        /// <param name="chunks">The chunk records; ordered here by <c>ChunkIndex</c> and matched by <c>QdrantPointId</c>.</param>
        /// <param name="vectorData">The entries searched, by <c>UId</c>, for each chunk's text.</param>
        /// <returns>
        /// <c>true</c> when the normalized texts are equal; <c>false</c> otherwise, including when
        /// any argument is null or the item's content is blank.
        /// </returns>
        public static bool IsChunkingConsistent(
            ContentItem item,
            List<ContentChunk> chunks,
            List<WebPageContent> vectorData)
        {
            bool hasReferenceText = item != null && !string.IsNullOrWhiteSpace(item.Content);
            if (!hasReferenceText || chunks == null || vectorData == null)
            {
                return false;
            }

            var reassembledRaw = new StringBuilder();
            foreach (var chunk in chunks.OrderBy(c => c.ChunkIndex))
            {
                // First entry whose id matches this chunk, if there is one.
                var match = vectorData.FirstOrDefault(v => v.UId == chunk.QdrantPointId);
                var piece = match?.Content?.Trim();
                if (piece != null)
                {
                    reassembledRaw.Append(piece);
                }
            }

            return NormalizeText(item.Content) == NormalizeText(reassembledRaw.ToString());
        }

        /// <summary>
        /// Reduces text to a form suitable for whitespace- and case-insensitive comparison.
        /// </summary>
        /// <remarks>
        /// Every whitespace character is removed, not only spaces and line breaks. Chunk text
        /// is trimmed at its edges before comparison, which drops tabs and other whitespace
        /// there; removing the same characters everywhere keeps both sides comparable no
        /// matter where a chunk boundary happened to fall.
        /// </remarks>
        /// <param name="text">The text to normalize; must not be null.</param>
        /// <returns>The text without whitespace, lower-cased with the invariant culture.</returns>
        private static string NormalizeText(string text)
        {
            var withoutWhitespace = new StringBuilder(text.Length);
            foreach (char c in text)
            {
                if (!char.IsWhiteSpace(c))
                {
                    withoutWhitespace.Append(c);
                }
            }

            // Lower-case the whole string at once so surrogate pairs are handled as before.
            return withoutWhitespace.ToString().ToLowerInvariant();
        }


    }


}