Source: services/document/Chunker.js

import ContentChunker from '../../zpt/transform/ContentChunker.js';
import { createHash } from 'crypto';
import { v4 as uuidv4 } from 'uuid';
import logger from 'loglevel';

/**
 * Document chunking service with Ragno ontology compliance
 * Implements paragraph-level chunking with markdown header delimiters
 * Creates hash-based URIs and maps to Ragno classes
 */
export default class Chunker {
  constructor(options = {}) {
    this.config = {
      maxChunkSize: options.maxChunkSize || 2000,
      minChunkSize: options.minChunkSize || 100,
      overlapSize: options.overlapSize || 100,
      strategy: options.strategy || 'semantic',
      baseNamespace: options.baseNamespace || 'http://example.org/semem/',
      ...options
    };

    // Initialize ZPT ContentChunker with our settings
    this.contentChunker = new ContentChunker({
      defaultChunkSize: this.config.maxChunkSize,
      maxChunkSize: this.config.maxChunkSize,
      minChunkSize: this.config.minChunkSize,
      overlapSize: this.config.overlapSize,
      preserveStructure: true,
      semanticBoundaries: true,
      balanceChunks: true
    });
  }

  /**
   * Chunk markdown content with Ragno compliance
   * @param {string} markdown - Markdown content to chunk
   * @param {Object} metadata - Source document metadata
   * @param {Object} options - Chunking options
   * @returns {Promise<Object>} Chunking result with Ragno-compliant data structures
   */
  async chunk(markdown, metadata = {}, options = {}) {
    if (!markdown || typeof markdown !== 'string') {
      throw new Error('Chunker: markdown content is required and must be a string');
    }

    if (markdown.trim().length === 0) {
      throw new Error('Chunker: markdown content is empty');
    }

    try {
      const startTime = Date.now();
      const chunkingOptions = { ...this.config, ...options };

      // Use ZPT ContentChunker for initial chunking
      const chunkingResult = await this.contentChunker.chunk(markdown, {
        strategy: chunkingOptions.strategy,
        maxChunkSize: chunkingOptions.maxChunkSize,
        minChunkSize: chunkingOptions.minChunkSize,
        overlapSize: chunkingOptions.overlapSize,
        preserveStructure: true,
        semanticBoundaries: true
      });

      const chunks = chunkingResult.chunks;
      const processingTime = Date.now() - startTime;

      // Create source document URI
      const sourceUri = this.mintDocumentUri(metadata);

      // Convert chunks to Ragno-compliant format
      const ragnoChunks = await this.createRagnoChunks(chunks, sourceUri, metadata);

      // Create corpus and community structures
      const corpus = this.createCorpus(sourceUri, ragnoChunks, metadata);
      const community = this.createCommunity(sourceUri, ragnoChunks, metadata);

      logger.debug(`Chunker: Created ${ragnoChunks.length} chunks from ${markdown.length} chars in ${processingTime}ms`);

      return {
        chunks: ragnoChunks,
        corpus,
        community,
        sourceUri,
        metadata: {
          ...metadata,
          chunking: {
            chunkCount: ragnoChunks.length,
            strategy: chunkingOptions.strategy,
            processingTime,
            avgChunkSize: Math.round(ragnoChunks.reduce((sum, c) => sum + c.size, 0) / ragnoChunks.length),
            ...chunkingResult.metadata
          }
        },
        success: true
      };

    } catch (error) {
      logger.error('Chunker: Error during chunking:', error.message);
      throw new Error(`Chunker: Failed to chunk content: ${error.message}`);
    }
  }

  /**
   * Create Ragno-compliant chunk objects
   * @private
   * @param {Array} chunks - Raw chunks from ContentChunker
   * @param {string} sourceUri - Source document URI
   * @param {Object} metadata - Source metadata
   * @returns {Promise<Array>} Ragno-compliant chunks
   */
  async createRagnoChunks(chunks, sourceUri, metadata) {
    const ragnoChunks = [];

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      const chunkUri = this.mintChunkUri(chunk.content, sourceUri, i);
      
      // Extract title from content if it starts with header
      const title = this.extractTitle(chunk.content);
      
      const ragnoChunk = {
        // Core properties
        uri: chunkUri,
        type: 'ragno:TextElement',
        content: chunk.content,
        size: chunk.size,
        
        // Ragno-specific properties
        isCorpuscle: true,
        title: title || `Chunk ${i + 1}`,
        index: i,
        
        // Relationships
        partOf: sourceUri,
        position: chunk.position || { start: 0, end: chunk.size },
        
        // Metadata
        metadata: {
          chunkId: chunk.id,
          strategy: chunk.type,
          originalMetadata: chunk.metadata || {},
          sourceFormat: metadata.format,
          processingTimestamp: new Date().toISOString(),
          hash: this.createContentHash(chunk.content)
        },

        // PROV-O provenance
        provenance: {
          wasGeneratedBy: 'chunking_activity',
          wasDerivedFrom: sourceUri,
          generatedAtTime: new Date().toISOString(),
          wasAttributedTo: 'semem:Chunker'
        }
      };

      ragnoChunks.push(ragnoChunk);
    }

    return ragnoChunks;
  }

  /**
   * Create Ragno Corpus structure
   * @private
   * @param {string} sourceUri - Source document URI
   * @param {Array} chunks - Ragno chunks
   * @param {Object} metadata - Source metadata
   * @returns {Object} Ragno Corpus
   */
  createCorpus(sourceUri, chunks, metadata) {
    const corpusUri = this.mintCorpusUri(sourceUri);
    
    return {
      uri: corpusUri,
      type: 'ragno:Corpus',
      label: metadata.title || 'Document Corpus',
      description: `Corpus created from ${metadata.sourceFile || 'document'}`,
      
      // SKOS Collection properties
      hasElement: chunks.map(c => c.uri),
      memberCount: chunks.length,
      
      // Source reference
      wasDerivedFrom: sourceUri,
      
      metadata: {
        sourceFormat: metadata.format,
        totalSize: chunks.reduce((sum, c) => sum + c.size, 0),
        createdAt: new Date().toISOString(),
        sourceMetadata: metadata
      }
    };
  }

  /**
   * Create Ragno Community structure
   * @private
   * @param {string} sourceUri - Source document URI  
   * @param {Array} chunks - Ragno chunks
   * @param {Object} metadata - Source metadata
   * @returns {Object} Ragno Community
   */
  createCommunity(sourceUri, chunks, metadata) {
    const communityUri = this.mintCommunityUri(sourceUri);
    
    return {
      uri: communityUri,
      type: 'ragno:Community',
      label: metadata.title ? `${metadata.title} Community` : 'Document Community',
      description: `Community of text elements from ${metadata.sourceFile || 'document'}`,
      
      // Community-specific properties
      hasCommunityElement: chunks.map(c => ({
        element: c.uri,
        type: 'ragno:CommunityElement'
      })),
      
      // Relationships
      basedOn: sourceUri,
      
      metadata: {
        elementCount: chunks.length,
        avgElementSize: Math.round(chunks.reduce((sum, c) => sum + c.size, 0) / chunks.length),
        createdAt: new Date().toISOString(),
        cohesion: this.calculateCohesion(chunks)
      }
    };
  }

  /**
   * Mint URI for source document
   * @private
   * @param {Object} metadata - Document metadata
   * @returns {string} Document URI
   */
  mintDocumentUri(metadata) {
    const identifier = metadata.sourceFile || 
                      metadata.conversionId || 
                      this.createContentHash(JSON.stringify(metadata));
    
    return `${this.config.baseNamespace}document/${this.createContentHash(identifier)}`;
  }

  /**
   * Mint URI for chunk
   * @private
   * @param {string} content - Chunk content
   * @param {string} sourceUri - Source document URI
   * @param {number} index - Chunk index
   * @returns {string} Chunk URI
   */
  mintChunkUri(content, sourceUri, index) {
    const contentHash = this.createContentHash(content);
    const sourceHash = this.createContentHash(sourceUri);
    
    return `${this.config.baseNamespace}chunk/${sourceHash}_${index}_${contentHash}`;
  }

  /**
   * Mint URI for corpus
   * @private
   * @param {string} sourceUri - Source document URI
   * @returns {string} Corpus URI
   */
  mintCorpusUri(sourceUri) {
    const sourceHash = this.createContentHash(sourceUri);
    return `${this.config.baseNamespace}corpus/${sourceHash}`;
  }

  /**
   * Mint URI for community
   * @private
   * @param {string} sourceUri - Source document URI
   * @returns {string} Community URI
   */
  mintCommunityUri(sourceUri) {
    const sourceHash = this.createContentHash(sourceUri);
    return `${this.config.baseNamespace}community/${sourceHash}`;
  }

  /**
   * Create SHA-256 hash of content
   * @private
   * @param {string} content - Content to hash
   * @returns {string} Hex hash
   */
  createContentHash(content) {
    return createHash('sha256').update(content, 'utf8').digest('hex').substring(0, 16);
  }

  /**
   * Extract title from chunk content (if starts with markdown header)
   * @private
   * @param {string} content - Chunk content
   * @returns {string|null} Extracted title or null
   */
  extractTitle(content) {
    if (!content) return null;
    
    const lines = content.trim().split('\n');
    const firstLine = lines[0].trim();
    
    // Check for markdown headers
    const headerMatch = firstLine.match(/^(#{1,6})\s+(.+)$/);
    if (headerMatch) {
      return headerMatch[2].trim();
    }
    
    // Check if first line could be a title (short, no punctuation at end)
    if (firstLine.length < 100 && !firstLine.endsWith('.') && !firstLine.endsWith('!') && !firstLine.endsWith('?')) {
      return firstLine;
    }
    
    return null;
  }

  /**
   * Calculate cohesion score for community
   * @private
   * @param {Array} chunks - Chunks to analyze
   * @returns {number} Cohesion score (0-1)
   */
  calculateCohesion(chunks) {
    if (chunks.length <= 1) return 1.0;
    
    // Simple cohesion based on size variance - more uniform sizes = higher cohesion
    const sizes = chunks.map(c => c.size);
    const avg = sizes.reduce((a, b) => a + b, 0) / sizes.length;
    const variance = sizes.reduce((sum, size) => sum + Math.pow(size - avg, 2), 0) / sizes.length;
    const stdDev = Math.sqrt(variance);
    
    // Normalize to 0-1 scale
    const cohesion = Math.max(0, 1 - (stdDev / avg));
    return Math.round(cohesion * 100) / 100;
  }

  /**
   * Validate chunking configuration
   * @param {Object} config - Configuration to validate
   * @returns {Object} Validation result
   */
  static validateConfig(config) {
    const errors = [];
    const warnings = [];

    if (config.maxChunkSize && config.maxChunkSize < 100) {
      errors.push('maxChunkSize must be at least 100 characters');
    }

    if (config.minChunkSize && config.minChunkSize < 10) {
      errors.push('minChunkSize must be at least 10 characters');
    }

    if (config.maxChunkSize && config.minChunkSize && config.maxChunkSize <= config.minChunkSize) {
      errors.push('maxChunkSize must be greater than minChunkSize');
    }

    if (config.overlapSize && config.overlapSize < 0) {
      errors.push('overlapSize cannot be negative');
    }

    if (config.strategy && !['fixed', 'semantic', 'adaptive', 'hierarchical', 'token_aware'].includes(config.strategy)) {
      warnings.push(`Unknown chunking strategy: ${config.strategy}`);
    }

    return {
      valid: errors.length === 0,
      errors,
      warnings
    };
  }

  /**
   * Get available chunking strategies
   * @returns {Array<string>} Available strategies
   */
  static getAvailableStrategies() {
    return ['fixed', 'semantic', 'adaptive', 'hierarchical', 'token_aware'];
  }

  /**
   * Get default configuration
   * @returns {Object} Default configuration
   */
  static getDefaultConfig() {
    return {
      maxChunkSize: 2000,
      minChunkSize: 100,
      overlapSize: 100,
      strategy: 'semantic',
      baseNamespace: 'http://example.org/semem/'
    };
  }
}