Source: zpt/transform/ContentChunker.js

/**
 * Intelligent content chunking with semantic boundaries and size limits.
 *
 * Splits either a plain string or an array of "corpuscle" objects into chunks
 * using one of several strategies (`fixed`, `semantic`, `adaptive`,
 * `hierarchical`, `token_aware`), then post-processes the chunks (balancing,
 * optional overlap, optional merging, prev/next relationships).
 *
 * Unless stated otherwise, chunk `position` values are character offsets into
 * the original string and `size` is the length of the original span (before
 * any trimming of `content`).
 */
export default class ContentChunker {
    /**
     * @param {Object} [options] - Configuration overrides. Unknown keys are
     *   kept on `this.config` untouched.
     * @param {number} [options.defaultChunkSize=1000] - Target chunk size.
     * @param {number} [options.maxChunkSize=4000] - Upper size bound.
     * @param {number} [options.minChunkSize=100] - Lower size bound (0 allowed).
     * @param {number} [options.overlapSize=100] - Overlap size (0 allowed).
     * @param {boolean} [options.preserveStructure=true]
     * @param {boolean} [options.semanticBoundaries=true]
     * @param {boolean} [options.balanceChunks=true]
     */
    constructor(options = {}) {
        // Spread first so the normalized values below always win; spreading
        // last would let `0`/`undefined` overwrite the defaults again.
        this.config = {
            ...options,
            defaultChunkSize: options.defaultChunkSize || 1000,
            maxChunkSize: options.maxChunkSize || 4000,
            // `??` keeps an explicit 0 ("no minimum" / "no overlap").
            minChunkSize: options.minChunkSize ?? 100,
            overlapSize: options.overlapSize ?? 100,
            preserveStructure: options.preserveStructure !== false,
            semanticBoundaries: options.semanticBoundaries !== false,
            balanceChunks: options.balanceChunks !== false
        };

        this.initializeChunkingStrategies();
        this.initializeBoundaryDetectors();
        this.resetStats();
    }

    /**
     * Initialize different chunking strategies.
     * Each entry maps a strategy key to its bound handler plus descriptive metadata.
     */
    initializeChunkingStrategies() {
        this.strategies = {
            fixed: {
                name: 'Fixed Size Chunking',
                handler: this.fixedSizeChunking.bind(this),
                description: 'Split content into fixed-size chunks',
                preservesBoundaries: false,
                efficiency: 'high'
            },
            semantic: {
                name: 'Semantic Boundary Chunking',
                handler: this.semanticChunking.bind(this),
                description: 'Split content at semantic boundaries',
                preservesBoundaries: true,
                efficiency: 'medium'
            },
            adaptive: {
                name: 'Adaptive Size Chunking',
                handler: this.adaptiveChunking.bind(this),
                description: 'Dynamically adjust chunk sizes based on content',
                preservesBoundaries: true,
                efficiency: 'low'
            },
            hierarchical: {
                name: 'Hierarchical Structure Chunking',
                handler: this.hierarchicalChunking.bind(this),
                description: 'Respect document structure hierarchy',
                preservesBoundaries: true,
                efficiency: 'medium'
            },
            token_aware: {
                name: 'Token-Aware Chunking',
                handler: this.tokenAwareChunking.bind(this),
                description: 'Chunk based on actual token counts',
                preservesBoundaries: true,
                efficiency: 'low'
            }
        };
    }

    /**
     * Initialize boundary detection patterns.
     *
     * A boundary's split position is the start of the match, unless the
     * detector sets `splitAfter: true`, in which case it is the end of the
     * match (used for sentences so the terminating punctuation stays with
     * the sentence it ends).
     */
    initializeBoundaryDetectors() {
        this.boundaryDetectors = {
            paragraph: {
                pattern: /\n\s*\n/,
                priority: 5,
                description: 'Paragraph breaks'
            },
            sentence: {
                pattern: /[.!?]+\s+(?=[A-Z])/,
                priority: 3,
                description: 'Sentence boundaries',
                splitAfter: true
            },
            section: {
                pattern: /\n#+\s+/,
                priority: 8,
                description: 'Markdown section headers'
            },
            list: {
                pattern: /\n\s*[-*+]\s+/,
                priority: 4,
                description: 'List items'
            },
            code: {
                pattern: /```[\s\S]*?```/,
                priority: 9,
                description: 'Code blocks'
            },
            quote: {
                pattern: /\n>\s+/,
                priority: 6,
                description: 'Quote blocks'
            }
        };
    }

    /**
     * Main chunking method - splits content based on strategy and constraints.
     * @param {string|Array|Object} content - Content to chunk (string, array of
     *   corpuscles, or an already-normalized `{ type, content }` object)
     * @param {Object} [options] - Chunking options, merged over the instance config
     * @returns {Promise<Object>} `{ chunks, metadata, statistics }`
     * @throws {Error} "Chunking failed: ..." wrapping any failure; the original
     *   error is attached as `cause`.
     */
    async chunk(content, options = {}) {
        const startTime = Date.now();
        const opts = { ...this.config, ...options };

        try {
            const normalizedContent = this.normalizeContent(content);

            const strategy = opts.strategy || this.selectOptimalStrategy(normalizedContent, opts);
            // Own-property check so names such as "constructor" are rejected
            // as unknown instead of resolving through the prototype chain.
            const strategyHandler = Object.prototype.hasOwnProperty.call(this.strategies, strategy)
                ? this.strategies[strategy]
                : undefined;

            if (!strategyHandler) {
                throw new Error(`Unknown chunking strategy: ${strategy}`);
            }

            const chunks = await strategyHandler.handler(normalizedContent, opts);
            const processedChunks = this.postProcessChunks(chunks, opts);

            this.validateChunks(processedChunks, opts);
            this.updateChunkingStats(processedChunks);

            return {
                chunks: processedChunks,
                metadata: {
                    strategy,
                    totalChunks: processedChunks.length,
                    originalSize: this.getContentSize(normalizedContent),
                    avgChunkSize: this.calculateAverageChunkSize(processedChunks),
                    processingTime: Date.now() - startTime,
                    options: opts
                },
                statistics: this.getChunkingStatistics(processedChunks)
            };
        } catch (error) {
            throw new Error(`Chunking failed: ${error.message}`, { cause: error });
        }
    }

    /**
     * Normalize content input to a standard `{ type, content, metadata }` shape.
     * Strings become `type: 'text'`, arrays become `type: 'corpuscles'`, and any
     * other object is returned as-is.
     * @throws {Error} When content is neither a string, an array nor an object.
     */
    normalizeContent(content) {
        if (typeof content === 'string') {
            return {
                type: 'text',
                content: content,
                metadata: {}
            };
        }

        if (Array.isArray(content)) {
            return {
                type: 'corpuscles',
                content: content,
                metadata: {
                    count: content.length,
                    types: [...new Set(content.map(c => c.type))]
                }
            };
        }

        if (content && typeof content === 'object') {
            return content;
        }

        throw new Error('Invalid content type for chunking');
    }

    /**
     * Select optimal chunking strategy based on content analysis.
     * @param {Object} normalizedContent - Output of `normalizeContent`
     * @param {Object} [options] - Effective chunking options
     * @returns {string} A key of `this.strategies`
     */
    selectOptimalStrategy(normalizedContent, options = {}) {
        const { type, content } = normalizedContent;

        // Token-aware strategy for precise token management
        if (options.tokenCounter && options.maxTokens) {
            return 'token_aware';
        }

        // Hierarchical for structured content
        if (type === 'corpuscles' || this.hasStructure(content)) {
            return 'hierarchical';
        }

        // Semantic for text content when boundaries matter. Per-call options
        // take precedence over the instance configuration.
        const semanticEnabled = options.semanticBoundaries ?? this.config.semanticBoundaries;
        if (semanticEnabled && typeof content === 'string') {
            return 'semantic';
        }

        // Adaptive for variable content sizes
        if (options.balanceChunks) {
            return 'adaptive';
        }

        // Default to fixed size for efficiency
        return 'fixed';
    }

    /**
     * Fixed size chunking strategy.
     *
     * Consecutive chunks share `overlapSize` characters. The overlap is clamped
     * to `chunkSize - 1` so every step advances, and iteration stops as soon as
     * a chunk reaches the end of the content (no trailing overlap-only chunk).
     * @throws {Error} When content is not a string or chunk size is not positive.
     */
    async fixedSizeChunking(normalizedContent, options = {}) {
        const { content } = normalizedContent;

        if (typeof content !== 'string') {
            throw new Error('Fixed size chunking requires string content');
        }

        const chunkSize = options.chunkSize || this.config.defaultChunkSize;
        if (!(chunkSize > 0)) {
            throw new Error('Fixed size chunking requires a positive chunk size');
        }

        // `??` so an explicit overlap of 0 is honoured.
        const requestedOverlap = Number(options.overlapSize ?? this.config.overlapSize) || 0;
        const overlap = Math.min(Math.max(requestedOverlap, 0), chunkSize - 1);
        const step = chunkSize - overlap;

        const chunks = [];

        for (let position = 0; position < content.length; position += step) {
            const endPosition = Math.min(position + chunkSize, content.length);
            const chunkContent = content.slice(position, endPosition);

            chunks.push({
                id: `chunk_${chunks.length}`,
                content: chunkContent,
                position: { start: position, end: endPosition },
                size: chunkContent.length,
                type: 'fixed',
                metadata: {
                    strategy: 'fixed',
                    overlap: position > 0 ? overlap : 0
                }
            });

            if (endPosition >= content.length) {
                break;
            }
        }

        return chunks;
    }

    /**
     * Semantic boundary chunking strategy.
     *
     * Walks the detected boundaries in order and greedily packs the segments
     * between them into chunks of at most `chunkSize` characters. A single
     * segment longer than the target is kept whole. The text after the last
     * boundary is included, and whitespace-only chunks are skipped.
     * @throws {Error} When content is not a string.
     */
    async semanticChunking(normalizedContent, options = {}) {
        const { content } = normalizedContent;

        if (typeof content !== 'string') {
            throw new Error('Semantic chunking requires string content');
        }

        const targetSize = options.chunkSize || this.config.defaultChunkSize;
        // Sentinel cut at the end so the trailing segment is processed too.
        const cutPoints = [
            ...this.findSemanticBoundaries(content),
            { position: content.length, type: 'end', priority: 0 }
        ];
        const chunks = [];

        let chunkStart = 0;        // offset where the chunk being built begins
        let cursor = 0;            // offset up to which text has been consumed
        let cursorBoundary = null; // boundary located at `cursor`

        const emit = (end, boundary) => {
            const raw = content.slice(chunkStart, end);
            if (raw.trim().length > 0) {
                chunks.push(this.createSemanticChunk(raw, chunkStart, end, chunks.length, boundary));
            }
            chunkStart = end;
        };

        for (const cut of cutPoints) {
            const segmentLength = cut.position - cursor;
            if (segmentLength <= 0) {
                continue;
            }

            const currentLength = cursor - chunkStart;
            if (currentLength > 0 && currentLength + segmentLength > targetSize) {
                // Close the chunk at the boundary it actually ends on.
                emit(cursor, cursorBoundary);
            }

            cursor = cut.position;
            cursorBoundary = cut;
        }

        if (cursor > chunkStart) {
            emit(cursor, null);
        }

        return chunks;
    }

    /**
     * Adaptive chunking strategy.
     *
     * Groups consecutive boundary-delimited segments into chunks no larger than
     * `maxChunkSize` (a single oversized segment is kept whole). Chunks cover
     * the text from its start to its end, including the part before the first
     * boundary and after the last one. Whitespace-only chunks are skipped.
     * @throws {Error} When content is not a string.
     */
    async adaptiveChunking(normalizedContent, options = {}) {
        const { content } = normalizedContent;
        const maxSize = options.maxChunkSize || this.config.maxChunkSize;

        if (typeof content !== 'string') {
            throw new Error('Adaptive chunking requires string content');
        }

        const boundaries = this.findSemanticBoundaries(content);
        const chunks = [];

        const emit = async (start, end, group) => {
            if (end <= start) {
                return;
            }
            const chunk = await this.createAdaptiveChunk(content, group, chunks.length, start, end);
            if (chunk.content.length > 0) {
                chunks.push(chunk);
            }
        };

        // The current group starts with the leading segment [0, first boundary).
        let groupStart = 0;
        let groupEnd = boundaries.length > 0 ? boundaries[0].position : content.length;
        let group = [];

        for (let i = 0; i < boundaries.length; i++) {
            const boundary = boundaries[i];
            const segmentEnd = i + 1 < boundaries.length ? boundaries[i + 1].position : content.length;
            const segmentSize = segmentEnd - boundary.position;
            const currentSize = groupEnd - groupStart;

            if (currentSize > 0 && currentSize + segmentSize > maxSize) {
                await emit(groupStart, groupEnd, group);
                groupStart = boundary.position;
                group = [];
            }

            group.push(boundary);
            groupEnd = segmentEnd;
        }

        await emit(groupStart, groupEnd, group);

        return chunks;
    }

    /**
     * Hierarchical chunking strategy.
     * Delegates to corpuscle grouping or to header-based text chunking.
     * @throws {Error} When content is neither corpuscles nor a string.
     */
    async hierarchicalChunking(normalizedContent, options = {}) {
        const { type, content } = normalizedContent;

        if (type === 'corpuscles') {
            return this.chunkCorpuscles(content, options);
        }

        if (typeof content !== 'string') {
            throw new Error('Hierarchical chunking requires string or corpuscle content');
        }

        return this.chunkHierarchicalText(content, options);
    }

    /**
     * Token-aware chunking strategy.
     *
     * Greedily extends the current chunk boundary by boundary while
     * `tokenCounter.countTokens(text, options.tokenizer)` reports a `count`
     * within `maxTokens`. A single segment above the limit is kept whole.
     * Chunk content is not trimmed, so the chunks concatenate back to the input.
     * @throws {Error} When no tokenCounter is supplied or content is not a string.
     */
    async tokenAwareChunking(normalizedContent, options = {}) {
        const { content } = normalizedContent;
        const tokenCounter = options.tokenCounter;
        const maxTokens = options.maxTokens || 1000;

        if (!tokenCounter) {
            throw new Error('Token-aware chunking requires tokenCounter');
        }

        if (typeof content !== 'string') {
            throw new Error('Token-aware chunking requires string content');
        }

        const chunks = [];
        const pushChunk = (start, end, tokens, boundaryType) => {
            const metadata = {
                strategy: 'token_aware',
                tokenizer: options.tokenizer
            };
            if (boundaryType !== undefined) {
                metadata.boundary = boundaryType;
            }
            chunks.push({
                id: `chunk_${chunks.length}`,
                content: content.slice(start, end),
                position: { start, end },
                size: end - start,
                tokenCount: tokens,
                type: 'token_aware',
                metadata
            });
        };

        // Sentinel cut at the end so the trailing segment is processed too.
        const cutPoints = [
            ...this.findSemanticBoundaries(content),
            { position: content.length, type: 'end', priority: 0 }
        ];

        let chunkStart = 0;
        let cursor = 0;
        let cursorBoundary = null;
        let currentTokens = 0; // token count of content[chunkStart, cursor)

        for (const cut of cutPoints) {
            if (cut.position <= cursor) {
                continue;
            }

            const candidate = content.slice(chunkStart, cut.position);
            const candidateTokens = await tokenCounter.countTokens(candidate, options.tokenizer);

            if (candidateTokens.count <= maxTokens || cursor === chunkStart) {
                currentTokens = candidateTokens.count;
            } else {
                // Extending would exceed the limit: close the chunk at `cursor`
                // and start a new one with the segment that did not fit.
                pushChunk(chunkStart, cursor, currentTokens, cursorBoundary.type);
                chunkStart = cursor;
                const segment = content.slice(chunkStart, cut.position);
                const segmentTokens = await tokenCounter.countTokens(segment, options.tokenizer);
                currentTokens = segmentTokens.count;
            }

            cursor = cut.position;
            cursorBoundary = cut;
        }

        if (cursor > chunkStart) {
            pushChunk(chunkStart, cursor, currentTokens);
        }

        return chunks;
    }

    /**
     * Find semantic boundaries in text.
     *
     * @param {string} text
     * @returns {Array<Object>} Boundaries sorted by position, one per position
     *   (highest priority wins). Each has `position`, `type`, `priority`,
     *   `length` (of the matched delimiter) and `segmentSize` (distance to the
     *   next boundary, or to the end of the text for the last one).
     */
    findSemanticBoundaries(text) {
        const boundaries = [];

        for (const [type, detector] of Object.entries(this.boundaryDetectors)) {
            const { pattern } = detector;
            // Keep the detector's own flags and just make sure `g` is present.
            let flags = 'g';
            if (pattern instanceof RegExp) {
                flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
            }

            for (const match of text.matchAll(new RegExp(pattern, flags))) {
                boundaries.push({
                    position: detector.splitAfter ? match.index + match[0].length : match.index,
                    type,
                    priority: detector.priority,
                    length: match[0].length,
                    segmentSize: 0
                });
            }
        }

        // Sort by position, then by priority (higher first)
        boundaries.sort((a, b) => {
            if (a.position !== b.position) {
                return a.position - b.position;
            }
            return b.priority - a.priority;
        });

        // Remove duplicates at same position (keep highest priority)
        const uniqueBoundaries = [];
        let lastPosition = -1;

        for (const boundary of boundaries) {
            if (boundary.position !== lastPosition) {
                uniqueBoundaries.push(boundary);
                lastPosition = boundary.position;
            }
        }

        // Segment sizes come straight from the sorted list; no per-boundary rescan.
        uniqueBoundaries.forEach((boundary, index) => {
            const next = uniqueBoundaries[index + 1];
            boundary.segmentSize = (next ? next.position : text.length) - boundary.position;
        });

        return uniqueBoundaries;
    }

    /**
     * Estimate segment size between `position` and the next boundary
     * (or the end of the text when there is none).
     */
    estimateSegmentSize(text, position) {
        const nextBoundary = this.findNextBoundary(text, position);
        return nextBoundary !== null ? nextBoundary - position : text.length - position;
    }

    /**
     * Find the nearest boundary position strictly after `currentPosition`,
     * considering all detectors.
     * @returns {number|null} The position, or null when no boundary follows.
     */
    findNextBoundary(text, currentPosition) {
        const next = this.findSemanticBoundaries(text)
            .find(boundary => boundary.position > currentPosition);
        return next ? next.position : null;
    }

    /**
     * Create semantic chunk object.
     * `content` is stored trimmed; `size` is the untrimmed length.
     */
    createSemanticChunk(content, start, end, id, boundary = null) {
        return {
            id: `chunk_${id}`,
            content: content.trim(),
            position: { start, end },
            size: content.length,
            type: 'semantic',
            metadata: {
                strategy: 'semantic',
                boundary: boundary ? boundary.type : 'end',
                boundaryPriority: boundary ? boundary.priority : 0
            }
        };
    }

    /**
     * Create adaptive chunk from boundary group.
     *
     * @param {string} text - Full source text
     * @param {Array<Object>} boundaryGroup - Boundaries contained in the chunk (may be empty)
     * @param {number} id - Sequential chunk number
     * @param {number} [start] - Start offset; defaults to the first boundary's
     *   position (0 for an empty group)
     * @param {number} [end] - End offset; defaults to the last boundary's
     *   position (end of text for an empty group)
     */
    async createAdaptiveChunk(
        text,
        boundaryGroup,
        id,
        start = boundaryGroup[0]?.position ?? 0,
        end = boundaryGroup[boundaryGroup.length - 1]?.position ?? text.length
    ) {
        const content = text.slice(start, end);
        const prioritySum = boundaryGroup.reduce((sum, b) => sum + b.priority, 0);

        return {
            id: `chunk_${id}`,
            content: content.trim(),
            position: { start, end },
            size: content.length,
            type: 'adaptive',
            metadata: {
                strategy: 'adaptive',
                boundaries: boundaryGroup.map(b => b.type),
                // Guard against 0/0 for a chunk that contains no boundary.
                avgBoundaryPriority: boundaryGroup.length > 0 ? prioritySum / boundaryGroup.length : 0
            }
        };
    }

    /**
     * Chunk corpuscles hierarchically.
     * Corpuscles are visited grouped by type and packed greedily until the
     * estimated size would exceed the target chunk size.
     */
    chunkCorpuscles(corpuscles, options = {}) {
        const targetSize = options.chunkSize || this.config.defaultChunkSize;
        const chunks = [];
        let currentChunk = [];
        let currentSize = 0;

        const finalize = () => {
            if (currentChunk.length === 0) {
                return;
            }
            chunks.push({
                id: `chunk_${chunks.length}`,
                content: currentChunk,
                size: currentSize,
                type: 'hierarchical_corpuscles',
                metadata: {
                    strategy: 'hierarchical',
                    corpuscleCount: currentChunk.length,
                    dominantType: this.getDominantType(currentChunk)
                }
            });
            currentChunk = [];
            currentSize = 0;
        };

        // Group corpuscles by type for better organization
        for (const typeCorpuscles of this.groupCorpusclesByType(corpuscles).values()) {
            for (const corpuscle of typeCorpuscles) {
                const corpuscleSize = this.estimateCorpuscleSize(corpuscle);

                if (currentSize + corpuscleSize > targetSize) {
                    finalize();
                }

                currentChunk.push(corpuscle);
                currentSize += corpuscleSize;
            }
        }

        finalize();

        return chunks;
    }

    /**
     * Chunk hierarchical text based on structure.
     * Each section from `analyzeTextStructure` becomes one chunk, or several
     * sub-chunks when it is larger than `maxChunkSize`.
     */
    async chunkHierarchicalText(text, options = {}) {
        const structure = this.analyzeTextStructure(text);
        const maxSize = options.maxChunkSize || this.config.maxChunkSize;
        const chunks = [];
        let chunkId = 0;

        for (const section of structure.sections) {
            const sectionContent = text.slice(section.start, section.end);
            const sectionSize = sectionContent.length;

            if (sectionSize <= maxSize) {
                // Section fits in one chunk
                chunks.push({
                    id: `chunk_${chunkId++}`,
                    content: sectionContent.trim(),
                    position: { start: section.start, end: section.end },
                    size: sectionSize,
                    type: 'hierarchical_section',
                    metadata: {
                        strategy: 'hierarchical',
                        sectionType: section.type,
                        level: section.level,
                        title: section.title
                    }
                });
            } else {
                // Split large section into sub-chunks
                const subChunks = await this.splitLargeSection(sectionContent, section, maxSize, chunkId);
                chunks.push(...subChunks);
                chunkId += subChunks.length;
            }
        }

        return chunks;
    }

    /**
     * Analyze text structure for hierarchical chunking.
     *
     * Sections start at each markdown header (`#` to `######`) and run to the
     * next header or the end of the text. Non-blank text before the first
     * header becomes its own `content` section. Text without headers is
     * returned as a single `content` section.
     * @returns {{sections: Array<Object>}}
     */
    analyzeTextStructure(text) {
        const sections = [];
        const headerPattern = /^(#{1,6})\s+(.+)$/gm;
        let match;

        while ((match = headerPattern.exec(text)) !== null) {
            if (sections.length === 0) {
                // Keep any preamble that precedes the first header.
                if (match.index > 0 && text.slice(0, match.index).trim().length > 0) {
                    sections.push({
                        start: 0,
                        end: match.index,
                        level: 1,
                        title: 'Content',
                        type: 'content'
                    });
                }
            } else {
                // Close previous section
                sections[sections.length - 1].end = match.index;
            }

            // Start new section
            sections.push({
                start: match.index,
                end: text.length, // Updated when the next header is found
                level: match[1].length,
                title: match[2].trim(),
                type: 'header'
            });
        }

        // If no headers found, treat as single section
        if (sections.length === 0) {
            sections.push({
                start: 0,
                end: text.length,
                level: 1,
                title: 'Content',
                type: 'content'
            });
        }

        return { sections };
    }

    /**
     * Split large section into sub-chunks using semantic chunking.
     * Sub-chunk positions are shifted by `section.start` so they stay
     * relative to the full text rather than to the section.
     */
    async splitLargeSection(sectionContent, section, maxSize, startId) {
        const semanticChunks = await this.semanticChunking(
            { type: 'text', content: sectionContent },
            { chunkSize: maxSize }
        );
        const offset = section.start || 0;

        return semanticChunks.map((chunk, index) => ({
            ...chunk,
            id: `chunk_${startId + index}`,
            position: {
                start: offset + chunk.position.start,
                end: offset + chunk.position.end
            },
            type: 'hierarchical_subsection',
            metadata: {
                ...chunk.metadata,
                parentSection: section.title,
                parentLevel: section.level,
                subsectionIndex: index
            }
        }));
    }

    /**
     * Post-process chunks for optimization: optional balancing, overlap and
     * merging, followed by prev/next relationship links.
     */
    postProcessChunks(chunks, options = {}) {
        let processed = [...chunks];

        // Balance chunk sizes if requested
        if (options.balanceChunks) {
            processed = this.balanceChunkSizes(processed, options);
        }

        // Add overlap if specified
        if (options.addOverlap && options.overlapSize > 0) {
            processed = this.addChunkOverlap(processed, options);
        }

        // Merge small chunks if beneficial
        if (options.mergeSmallChunks) {
            processed = this.mergeSmallChunks(processed, options);
        }

        // Add chunk relationships
        processed = this.addChunkRelationships(processed);

        return processed;
    }

    /**
     * Balance chunk sizes for more uniform distribution.
     * Chunks more than 20% above the target are split; chunks more than 20%
     * below it are returned as copies flagged with `needsMerging: true`.
     * Input chunk objects are not mutated.
     */
    balanceChunkSizes(chunks, options = {}) {
        const targetSize = options.chunkSize || this.config.defaultChunkSize;
        const tolerance = 0.2; // 20% tolerance
        const upperBound = targetSize * (1 + tolerance);
        const lowerBound = targetSize * (1 - tolerance);
        const balanced = [];

        for (const chunk of chunks) {
            if (chunk.size > upperBound) {
                balanced.push(...this.splitChunk(chunk, targetSize));
            } else if (chunk.size < lowerBound) {
                balanced.push({ ...chunk, needsMerging: true });
            } else {
                balanced.push(chunk);
            }
        }

        return balanced;
    }

    /**
     * Add overlap between consecutive chunks by prefixing each string chunk
     * with the tail of the previous string chunk.
     */
    addChunkOverlap(chunks, options = {}) {
        const overlapSize = options.overlapSize;

        // `slice(-0)` would copy the entire previous chunk, so bail out early.
        if (!(overlapSize > 0)) {
            return chunks.map(chunk => ({ ...chunk }));
        }

        return chunks.map((original, i) => {
            const chunk = { ...original };
            const prevChunk = i > 0 ? chunks[i - 1] : null;

            if (prevChunk && typeof chunk.content === 'string' && typeof prevChunk.content === 'string') {
                const overlap = prevChunk.content.slice(-overlapSize);
                chunk.content = overlap + chunk.content;
                chunk.metadata = {
                    ...chunk.metadata,
                    hasOverlap: true,
                    overlapSize: overlap.length
                };
            }

            return chunk;
        });
    }

    /**
     * Merge small consecutive chunks.
     * A chunk is "small" when flagged `needsMerging` or below `minChunkSize`.
     * Runs of small chunks are merged, but a merge group is closed before its
     * combined size would exceed `maxChunkSize`.
     */
    mergeSmallChunks(chunks, options = {}) {
        const minSize = options.minChunkSize ?? this.config.minChunkSize;
        const maxSize = options.maxChunkSize || this.config.maxChunkSize;
        const merged = [];
        let group = [];
        let groupSize = 0;

        const flush = () => {
            if (group.length === 1) {
                merged.push(group[0]);
            } else if (group.length > 1) {
                merged.push(this.mergeChunkGroup(group));
            }
            group = [];
            groupSize = 0;
        };

        for (const chunk of chunks) {
            const isSmall = chunk.needsMerging || chunk.size < minSize;

            if (!isSmall) {
                flush();
                merged.push(chunk);
                continue;
            }

            if (group.length > 0 && groupSize + chunk.size > maxSize) {
                flush();
            }

            group.push(chunk);
            groupSize += chunk.size;
        }

        flush();

        return merged;
    }

    /**
     * Add relationships between chunks (previous/next ids, sequence, total).
     */
    addChunkRelationships(chunks) {
        return chunks.map((chunk, index) => ({
            ...chunk,
            relationships: {
                previous: index > 0 ? chunks[index - 1].id : null,
                next: index < chunks.length - 1 ? chunks[index + 1].id : null,
                sequence: index,
                total: chunks.length
            }
        }));
    }

    /**
     * Whether a string contains markdown-like structure (headers, bullet or
     * numbered lists, fenced code blocks). Non-strings are never structured.
     */
    hasStructure(content) {
        if (typeof content !== 'string') return false;

        const structurePatterns = [
            /^#{1,6}\s+/m, // Headers
            /^\s*[-*+]\s+/m, // Lists
            /```[\s\S]*?```/, // Code blocks
            /^\s*\d+\.\s+/m // Numbered lists
        ];

        return structurePatterns.some(pattern => pattern.test(content));
    }

    /**
     * Size of normalized content: string length, or the summed estimated
     * corpuscle sizes for corpuscle arrays; 0 for anything else.
     */
    getContentSize(normalizedContent) {
        const { type, content } = normalizedContent;

        if (type === 'corpuscles' && Array.isArray(content)) {
            return content.reduce((sum, c) => sum + this.estimateCorpuscleSize(c), 0);
        }

        if (typeof content === 'string') {
            return content.length;
        }

        return 0;
    }

    /**
     * Estimate a corpuscle's size from its `content`: the string length for
     * string content, otherwise the length of its field values joined by spaces.
     */
    estimateCorpuscleSize(corpuscle) {
        const content = corpuscle.content || {};

        if (typeof content === 'string') {
            return content.length;
        }

        return Object.values(content).join(' ').length;
    }

    /**
     * Group corpuscles by their `type` (missing type becomes 'unknown').
     * @returns {Map<string, Array>} Groups in first-seen order.
     */
    groupCorpusclesByType(corpuscles) {
        const groups = new Map();

        for (const corpuscle of corpuscles) {
            const type = corpuscle.type || 'unknown';
            if (!groups.has(type)) {
                groups.set(type, []);
            }
            groups.get(type).push(corpuscle);
        }

        return groups;
    }

    /**
     * Most frequent corpuscle type in the list ('unknown' for an empty list;
     * the first type to reach the highest count wins ties).
     */
    getDominantType(corpuscles) {
        const typeCounts = new Map();

        for (const corpuscle of corpuscles) {
            const type = corpuscle.type || 'unknown';
            typeCounts.set(type, (typeCounts.get(type) || 0) + 1);
        }

        let dominantType = 'unknown';
        let maxCount = 0;

        for (const [type, count] of typeCounts) {
            if (count > maxCount) {
                maxCount = count;
                dominantType = type;
            }
        }

        return dominantType;
    }

    /**
     * Split a string chunk into consecutive pieces of at most `targetSize`
     * characters. Non-string chunks and non-positive target sizes return the
     * chunk unchanged.
     *
     * Sub-chunk positions are offset by the parent's `position.start` when it
     * has one. This is approximate when the parent's content was trimmed,
     * because the offsets are measured within the trimmed content.
     */
    splitChunk(chunk, targetSize) {
        if (typeof chunk.content !== 'string') {
            return [chunk]; // Can't split non-string content easily
        }

        // A non-positive step would never advance and loop forever.
        if (!(targetSize > 0)) {
            return [chunk];
        }

        const content = chunk.content;
        const base = chunk.position && typeof chunk.position.start === 'number'
            ? chunk.position.start
            : 0;
        const subChunks = [];

        for (let position = 0; position < content.length; position += targetSize) {
            const endPos = Math.min(position + targetSize, content.length);
            const subContent = content.slice(position, endPos);
            const splitIndex = subChunks.length;

            subChunks.push({
                id: `${chunk.id}_sub_${splitIndex}`,
                content: subContent,
                position: { start: base + position, end: base + endPos },
                size: subContent.length,
                type: `${chunk.type}_split`,
                metadata: {
                    ...chunk.metadata,
                    parentChunk: chunk.id,
                    splitIndex
                }
            });
        }

        return subChunks;
    }

    /**
     * Merge a group of chunks into one. String contents are joined with a
     * blank line; non-string contents are JSON-serialized first. `size` is the
     * sum of the members' sizes. When the first and last members carry a
     * position, the merged chunk spans from the first start to the last end.
     */
    mergeChunkGroup(chunkGroup) {
        const mergedContent = chunkGroup.map(c =>
            typeof c.content === 'string' ? c.content : JSON.stringify(c.content)
        ).join('\n\n');

        const totalSize = chunkGroup.reduce((sum, c) => sum + c.size, 0);
        const ids = chunkGroup.map(c => c.id);

        const merged = {
            id: `merged_${ids.join('_')}`,
            content: mergedContent,
            size: totalSize,
            type: 'merged',
            metadata: {
                strategy: 'merged',
                originalChunks: ids,
                mergedCount: chunkGroup.length
            }
        };

        const first = chunkGroup[0];
        const last = chunkGroup[chunkGroup.length - 1];
        if (first && first.position && last.position) {
            merged.position = { start: first.position.start, end: last.position.end };
        }

        return merged;
    }

    /**
     * Mean of `chunk.size` over the chunks (0 for an empty list).
     */
    calculateAverageChunkSize(chunks) {
        if (chunks.length === 0) return 0;
        return chunks.reduce((sum, chunk) => sum + chunk.size, 0) / chunks.length;
    }

    /**
     * Log a warning for every chunk above `maxChunkSize`, and for every chunk
     * below `minChunkSize` when there is more than one chunk. Never throws.
     */
    validateChunks(chunks, options = {}) {
        const maxSize = options.maxChunkSize || this.config.maxChunkSize;
        const minSize = options.minChunkSize ?? this.config.minChunkSize;

        for (const chunk of chunks) {
            if (chunk.size > maxSize) {
                console.warn(`Chunk ${chunk.id} exceeds maximum size: ${chunk.size} > ${maxSize}`);
            }

            if (chunk.size < minSize && chunks.length > 1) {
                console.warn(`Chunk ${chunk.id} below minimum size: ${chunk.size} < ${minSize}`);
            }
        }
    }

    /**
     * Fold a batch of chunks into the instance statistics. `avgChunkSize` is
     * the running average over all chunks counted so far; an empty batch
     * leaves the statistics untouched.
     */
    updateChunkingStats(chunks) {
        if (chunks.length === 0) {
            return;
        }

        const stats = this.chunkingStats;
        const batchSize = chunks.reduce((sum, chunk) => sum + chunk.size, 0);
        const previousTotal = stats.totalChunks;
        const newTotal = previousTotal + chunks.length;

        stats.avgChunkSize = (stats.avgChunkSize * previousTotal + batchSize) / newTotal;
        stats.totalChunks = newTotal;
        stats.chunksCreated += chunks.length;
    }

    /**
     * Summary statistics for one list of chunks. All numeric fields are 0 and
     * `typeDistribution` is empty when the list is empty.
     */
    getChunkingStatistics(chunks) {
        if (chunks.length === 0) {
            return {
                count: 0,
                totalSize: 0,
                avgSize: 0,
                minSize: 0,
                maxSize: 0,
                sizeVariance: 0,
                typeDistribution: {}
            };
        }

        const sizes = chunks.map(c => c.size);
        const types = chunks.map(c => c.type);
        const totalSize = sizes.reduce((a, b) => a + b, 0);

        return {
            count: chunks.length,
            totalSize,
            avgSize: totalSize / sizes.length,
            // reduce instead of spread: no argument-count limit on large lists
            minSize: sizes.reduce((a, b) => Math.min(a, b)),
            maxSize: sizes.reduce((a, b) => Math.max(a, b)),
            sizeVariance: this.calculateVariance(sizes),
            typeDistribution: this.getTypeDistribution(types)
        };
    }

    /**
     * Population variance of a list of numbers (0 for an empty list).
     */
    calculateVariance(numbers) {
        if (numbers.length === 0) return 0;

        const mean = numbers.reduce((a, b) => a + b, 0) / numbers.length;
        const squaredDiffs = numbers.map(n => Math.pow(n - mean, 2));
        return squaredDiffs.reduce((a, b) => a + b, 0) / numbers.length;
    }

    /**
     * Count occurrences of each type.
     * @returns {Object} Plain object mapping type to count.
     */
    getTypeDistribution(types) {
        const distribution = new Map();
        for (const type of types) {
            distribution.set(type, (distribution.get(type) || 0) + 1);
        }
        return Object.fromEntries(distribution);
    }

    /**
     * Get available chunking strategies
     */
    getAvailableStrategies() {
        return Object.keys(this.strategies);
    }

    /**
     * Get strategy information (a shallow copy), or null for an unknown name
     */
    getStrategyInfo(strategyName) {
        return this.strategies[strategyName] ? { ...this.strategies[strategyName] } : null;
    }

    /**
     * Get chunking statistics (a shallow copy)
     */
    getStats() {
        return { ...this.chunkingStats };
    }

    /**
     * Reset statistics
     */
    resetStats() {
        this.chunkingStats = {
            totalChunks: 0,
            avgChunkSize: 0,
            chunksCreated: 0
        };
    }
}