// 131 lines, 4.4 KiB, C#
using BLAIzor.Models;
|
|
using System.Text;
|
|
|
|
namespace BLAIzor.Helpers
|
|
{
|
|
/// <summary>
/// Helpers for splitting text into size-bounded chunks and for checking that stored chunks
/// still reassemble into the text they were produced from.
/// </summary>
public static class ChunkingHelper
{
    // Internal placeholder every delimiter is rewritten to before the text is split.
    private const string SectionBreakMarker = "[[SECTION_BREAK]]";

    // Markers treated as logical section boundaries. The marker text itself is removed from the output.
    private static readonly string[] SectionDelimiters =
    {
        "\n##", "\n###", "\nSection:", "\nTopic:", "\nTitle:", "\n---", "[[CHUNK_BREAK]]", "### Logical Break ###"
    };

    /// <summary>
    /// Splits <paramref name="content"/> at the known section delimiters and then packs
    /// consecutive sections together into chunks of at most <paramref name="maxChunkCharLength"/> characters.
    /// </summary>
    /// <param name="content">The text to split. Null, empty or whitespace-only input yields an empty list.</param>
    /// <param name="maxChunkCharLength">Upper bound, in UTF-16 code units, for a single chunk. Must be positive.</param>
    /// <returns>
    /// The chunks in document order. Sections packed into the same chunk are separated by a blank line.
    /// A section longer than the limit is cut into fixed-size pieces that are emitted as-is.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="maxChunkCharLength"/> is zero or negative and there is content to split.
    /// </exception>
    public static List<string> SplitStructuredText(string content, int maxChunkCharLength = 3000)
    {
        if (string.IsNullOrWhiteSpace(content))
        {
            return new List<string>();
        }

        // Checked after the blank-content shortcut so calls that used to succeed keep succeeding;
        // a non-positive limit on real content previously hung (0) or failed inside Substring (< 0).
        if (maxChunkCharLength <= 0)
        {
            throw new ArgumentOutOfRangeException(
                nameof(maxChunkCharLength), maxChunkCharLength, "Maximum chunk length must be greater than zero.");
        }

        // Normalize newlines
        content = content.Replace("\r\n", "\n").Trim();

        // Step 1: Split by known logical boundaries
        var logicalParts = SplitByDelimiters(content, SectionDelimiters);

        // Step 2: Recombine smaller logical parts into full-size chunks
        var chunks = new List<string>();
        var currentChunk = new StringBuilder();

        foreach (var part in logicalParts)
        {
            // The "+ 2" reserves room for the blank-line separator that follows each part.
            if (currentChunk.Length + part.Length + 2 <= maxChunkCharLength)
            {
                currentChunk.AppendLine(part);
                currentChunk.AppendLine();
                continue;
            }

            // The part does not fit: close the chunk being built before dealing with it.
            if (currentChunk.Length > 0)
            {
                chunks.Add(currentChunk.ToString().Trim());
                currentChunk.Clear();
            }

            if (part.Length > maxChunkCharLength)
            {
                // Too large even on its own: hard-split it and emit the pieces directly.
                chunks.AddRange(SplitLongParagraph(part, maxChunkCharLength));
            }
            else
            {
                currentChunk.AppendLine(part);
                currentChunk.AppendLine();
            }
        }

        if (currentChunk.Length > 0)
        {
            chunks.Add(currentChunk.ToString().Trim());
        }

        return chunks;
    }

    /// <summary>
    /// Cuts <paramref name="content"/> at every occurrence of any of the <paramref name="delimiters"/>.
    /// The delimiter text is dropped; the returned parts are trimmed and never empty.
    /// </summary>
    private static List<string> SplitByDelimiters(string content, string[] delimiters)
    {
        var marked = content;

        // Longest delimiter first: otherwise "\n##" matches the prefix of "\n###" (and of
        // "\n### Logical Break ###") and leaves a stray '#' at the start of the next part.
        // OrderByDescending is a stable sort, so equally long delimiters keep their declared order.
        foreach (var delimiter in delimiters.OrderByDescending(d => d.Length))
        {
            marked = marked.Replace(delimiter, "\n" + SectionBreakMarker);
        }

        return marked.Split(new[] { SectionBreakMarker }, StringSplitOptions.RemoveEmptyEntries)
            .Select(s => s.Trim())
            .Where(s => s.Length > 0)
            .ToList();
    }

    /// <summary>
    /// Cuts <paramref name="text"/> into consecutive pieces of at most <paramref name="maxLength"/>
    /// UTF-16 code units, without trimming or otherwise altering the text.
    /// </summary>
    private static List<string> SplitLongParagraph(string text, int maxLength)
    {
        if (maxLength <= 0)
        {
            throw new ArgumentOutOfRangeException(
                nameof(maxLength), maxLength, "Maximum piece length must be greater than zero.");
        }

        var chunks = new List<string>();
        int index = 0;

        while (index < text.Length)
        {
            int len = Math.Min(maxLength, text.Length - index);
            int cut = index + len;

            // Do not cut between the two halves of a surrogate pair: that would leave an invalid
            // lone surrogate at the end of this piece and at the start of the next one.
            // With maxLength == 1 the pair cannot be kept together, so it is left alone.
            if (len > 1
                && cut < text.Length
                && char.IsHighSurrogate(text[cut - 1])
                && char.IsLowSurrogate(text[cut]))
            {
                len--;
            }

            chunks.Add(text.Substring(index, len));
            index += len;
        }

        return chunks;
    }

    /// <summary>
    /// Checks whether the Qdrant-based chunks match the original ContentItem text.
    /// </summary>
    /// <param name="item">The ContentItem to check.</param>
    /// <param name="chunks">The SQL chunks that reference Qdrant IDs.</param>
    /// <param name="vectorData">All related WebPageContent objects pulled from Qdrant.</param>
    /// <returns>
    /// <c>true</c> when the chunk contents, concatenated in <c>ChunkIndex</c> order, equal the item's
    /// content after whitespace and letter case are ignored; <c>false</c> otherwise, including when
    /// any argument is null or the item's content is blank. A chunk with no matching vector entry
    /// contributes an empty string.
    /// </returns>
    public static bool IsChunkingConsistent(
        ContentItem item,
        List<ContentChunk> chunks,
        List<WebPageContent> vectorData)
    {
        if (item == null || string.IsNullOrWhiteSpace(item.Content) || chunks == null || vectorData == null)
        {
            return false;
        }

        var orderedChunks = chunks.OrderBy(c => c.ChunkIndex).ToList();

        // Resolve each chunk to the content of the first vector entry carrying its point id.
        var combinedText = string.Join("",
            orderedChunks.Select(chunk =>
                vectorData.FirstOrDefault(v => v.UId == chunk.QdrantPointId)?.Content?.Trim() ?? ""
            ));

        var original = NormalizeText(item.Content);
        var reassembled = NormalizeText(combinedText);

        return original == reassembled;
    }

    /// <summary>
    /// Reduces text to a comparison form: every whitespace character removed, lower-cased invariantly.
    /// </summary>
    private static string NormalizeText(string text)
    {
        // All whitespace is dropped (not only ' ', '\r', '\n') because chunk contents are trimmed of
        // any whitespace before being joined; a tab at a chunk edge must not count as a difference.
        var builder = new StringBuilder(text.Length);

        foreach (var ch in text)
        {
            if (!char.IsWhiteSpace(ch))
            {
                builder.Append(ch);
            }
        }

        return builder.ToString().ToLowerInvariant();
    }
}
|
|
|
|
|
|
}
|