// SeemGen/Helpers/ChunkingHelper.cs
// (repository listing metadata: 131 lines, 4.4 KiB, C#)

using BLAIzor.Models;
using System.Text;
namespace BLAIzor.Helpers
{
public static class ChunkingHelper
{
/// <summary>
/// Literal markers at which <see cref="SplitStructuredText"/> cuts the input into
/// logical sections (they are handed to <c>SplitByDelimiters</c>).
/// </summary>
private static readonly string[] SectionDelimiters = new[]
{
    "\n##",
    "\n###",
    "\nSection:",
    "\nTopic:",
    "\nTitle:",
    "\n---",
    "[[CHUNK_BREAK]]",
    "### Logical Break ###",
};
/// <summary>
/// Splits <paramref name="content"/> into chunks, cutting at the markers in
/// <c>SectionDelimiters</c> and then packing consecutive sections together while
/// they fit within <paramref name="maxChunkCharLength"/> characters.
/// </summary>
/// <param name="content">Text to split. Null or whitespace-only input yields an empty list.</param>
/// <param name="maxChunkCharLength">Character budget per chunk; must be greater than zero.</param>
/// <returns>
/// The chunks in document order. Sections packed into one chunk are separated by a blank
/// line ("\n\n"); a section longer than the budget is cut by <c>SplitLongParagraph</c> and
/// each resulting piece becomes its own chunk.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="maxChunkCharLength"/> is zero or negative and <paramref name="content"/> is not blank.
/// </exception>
public static List<string> SplitStructuredText(string content, int maxChunkCharLength = 3000)
{
    // Blank input is checked first so it keeps returning an empty list regardless of the budget.
    if (string.IsNullOrWhiteSpace(content))
    {
        return new List<string>();
    }

    // A zero budget would make the long-section splitter loop forever; fail fast instead.
    if (maxChunkCharLength <= 0)
    {
        throw new ArgumentOutOfRangeException(
            nameof(maxChunkCharLength),
            maxChunkCharLength,
            "Chunk length must be greater than zero.");
    }

    // Fixed "\n" separator rather than AppendLine(): AppendLine writes Environment.NewLine,
    // which would re-introduce "\r\n" on Windows right after the input was normalized to "\n".
    const string PartSeparator = "\n\n";

    // Normalize newlines
    content = content.Replace("\r\n", "\n").Trim();

    // Step 1: Split by known logical boundaries
    var logicalParts = SplitByDelimiters(content, SectionDelimiters);

    // Step 2: Recombine smaller logical parts into full-size chunks
    var chunks = new List<string>();
    var currentChunk = new StringBuilder();

    foreach (var part in logicalParts)
    {
        if (currentChunk.Length + part.Length + PartSeparator.Length <= maxChunkCharLength)
        {
            currentChunk.Append(part).Append(PartSeparator);
            continue;
        }

        // The part does not fit: emit what has been collected so far.
        if (currentChunk.Length > 0)
        {
            chunks.Add(currentChunk.ToString().Trim());
            currentChunk.Clear();
        }

        if (part.Length > maxChunkCharLength)
        {
            // Too large even on its own; cut it into budget-sized pieces.
            chunks.AddRange(SplitLongParagraph(part, maxChunkCharLength));
        }
        else
        {
            currentChunk.Append(part).Append(PartSeparator);
        }
    }

    if (currentChunk.Length > 0)
    {
        chunks.Add(currentChunk.ToString().Trim());
    }

    return chunks;
}
/// <summary>
/// Cuts <paramref name="content"/> at every occurrence of any of the given delimiters.
/// </summary>
/// <param name="content">Text to split.</param>
/// <param name="delimiters">Literal delimiter strings; null or empty entries are ignored.</param>
/// <returns>
/// The trimmed, non-empty pieces between delimiters, in order. The delimiter text itself
/// is not part of any returned piece.
/// </returns>
private static List<string> SplitByDelimiters(string content, string[] delimiters)
{
    const string SectionBreak = "[[SECTION_BREAK]]";

    // Replace longer delimiters first. Otherwise a short delimiter that is a prefix or a
    // fragment of a longer one (e.g. "\n##" vs "\n###" or "### Logical Break ###") consumes
    // part of it and leaves the remainder ("#", "# Logical Break ###") inside the content.
    var longestFirst = delimiters
        .Where(d => !string.IsNullOrEmpty(d))
        .OrderByDescending(d => d.Length);

    var marked = content;
    foreach (var delimiter in longestFirst)
    {
        marked = marked.Replace(delimiter, "\n" + SectionBreak);
    }

    return marked.Split(new[] { SectionBreak }, StringSplitOptions.RemoveEmptyEntries)
        .Select(piece => piece.Trim())
        .Where(piece => piece.Length > 0)
        .ToList();
}
/// <summary>
/// Cuts <paramref name="text"/> into consecutive pieces of at most
/// <paramref name="maxLength"/> UTF-16 code units, without separating a surrogate pair.
/// </summary>
/// <param name="text">Text to cut. Null or empty input yields an empty list.</param>
/// <param name="maxLength">Maximum piece length; must be greater than zero.</param>
/// <returns>
/// The pieces in order; concatenating them reproduces <paramref name="text"/>. A piece is
/// shortened by one when the cut would fall between a high and a low surrogate. Only when
/// <paramref name="maxLength"/> is 1 can a piece hold 2 code units (one whole surrogate pair).
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxLength"/> is zero or negative.</exception>
private static List<string> SplitLongParagraph(string text, int maxLength)
{
    // With a zero length the loop below would never advance.
    if (maxLength <= 0)
    {
        throw new ArgumentOutOfRangeException(
            nameof(maxLength),
            maxLength,
            "Maximum length must be greater than zero.");
    }

    var chunks = new List<string>();
    if (string.IsNullOrEmpty(text))
    {
        return chunks;
    }

    int index = 0;
    while (index < text.Length)
    {
        int len = Math.Min(maxLength, text.Length - index);
        int end = index + len;

        // Never cut between the two halves of a surrogate pair: move the cut one unit back,
        // or, if that would leave an empty piece, keep the whole pair together.
        if (end < text.Length
            && char.IsHighSurrogate(text[end - 1])
            && char.IsLowSurrogate(text[end]))
        {
            len = len > 1 ? len - 1 : 2;
        }

        chunks.Add(text.Substring(index, len));
        index += len;
    }

    return chunks;
}
/// <summary>
/// Checks whether the Qdrant-based chunks, reassembled in chunk order, match the
/// original ContentItem text.
/// </summary>
/// <param name="item">The ContentItem to check.</param>
/// <param name="chunks">The SQL chunks that reference Qdrant IDs.</param>
/// <param name="vectorData">All related WebPageContent objects pulled from Qdrant.</param>
/// <returns>
/// <c>true</c> when the item's content and the concatenated chunk contents are equal after
/// both have been passed through <c>NormalizeText</c>; <c>false</c> otherwise, including when
/// <paramref name="item"/>, its content, <paramref name="chunks"/> or
/// <paramref name="vectorData"/> is missing.
/// </returns>
public static bool IsChunkingConsistent(
    ContentItem item,
    List<ContentChunk> chunks,
    List<WebPageContent> vectorData)
{
    if (item == null || string.IsNullOrWhiteSpace(item.Content))
    {
        return false;
    }

    if (chunks == null || vectorData == null)
    {
        return false;
    }

    // Rebuild the text chunk by chunk, in ChunkIndex order.
    var reassembledText = new StringBuilder();
    foreach (var chunk in chunks.OrderBy(c => c.ChunkIndex))
    {
        // A chunk with no matching vector entry (or with null content) contributes nothing.
        var match = vectorData.FirstOrDefault(v => v.UId == chunk.QdrantPointId);
        reassembledText.Append(match?.Content?.Trim() ?? "");
    }

    var expected = NormalizeText(item.Content);
    var actual = NormalizeText(reassembledText.ToString());
    return expected == actual;
}
/// <summary>
/// Reduces text to a form suitable for whitespace- and case-insensitive comparison:
/// every whitespace character is removed and the rest is lower-cased (invariant culture).
/// </summary>
/// <param name="text">Text to normalize; null is treated as empty.</param>
/// <returns>The normalized text, never null.</returns>
private static string NormalizeText(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return string.Empty;
    }

    // Drop every whitespace character, not just ' ', '\r' and '\n'. The chunking code
    // uses Trim(), which also strips tabs and other Unicode whitespace, so leaving those
    // in here would report a mismatch for text that differs only in whitespace.
    var compact = new StringBuilder(text.Length);
    foreach (char c in text)
    {
        if (!char.IsWhiteSpace(c))
        {
            compact.Append(c);
        }
    }

    return compact.ToString().ToLowerInvariant();
}
}
}