Source: ragno/decomposeCorpus.js

/**
 * Ragno: Corpus Decomposition Logic (Step 1) - RDF-Ext Version
 * 
 * This module decomposes text chunks into RDF-based semantic units, entities, 
 * and relationships following the ragno ontology. It integrates with Semem's
 * LLMHandler and creates proper RDF resources for the knowledge graph.
 */

import rdf from 'rdf-ext'
import SemanticUnit from './SemanticUnit.js'
import Entity from './Entity.js'
import Relationship from './Relationship.js'
import RDFGraphManager from './core/RDFGraphManager.js'
import NamespaceManager from './core/NamespaceManager.js'
import SPARQLHelpers from '../utils/SPARQLHelpers.js'
import ParseHelper from '../utils/ParseHelper.js'
import { logger } from '../Utils.js'

/**
 * Decompose text chunks into RDF-based semantic units, entities, and relationships
 * @param {Array<{content: string, source: string}>} textChunks - Text chunks to decompose
 * @param {Object} llmHandler - Instance of Semem's LLMHandler
 * @param {Object} [options] - Decomposition options
 * @returns {Promise<{units: SemanticUnit[], entities: Entity[], relationships: Relationship[], dataset: Dataset, connections: Array<Object>, statistics: Object}>}
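 *
 * @example
 * // Hypothetical usage sketch; assumes a configured LLMHandler instance
 * const chunks = [{ content: 'Ada Lovelace worked with Charles Babbage.', source: 'notes.txt' }]
 * const { units, entities, relationships, dataset } = await decomposeCorpus(chunks, llmHandler)
 * console.log(`${units.length} units, ${entities.length} entities, ${dataset.size} triples`)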
 */
export async function decomposeCorpus(textChunks, llmHandler, options = {}) {
  const startTime = Date.now()
  logger.info(`Starting corpus decomposition: ${textChunks.length} chunks`)

  // Spread caller options first so the explicit defaults below are not clobbered by raw values
  const opts = {
    ...options,
    extractRelationships: options.extractRelationships !== false,
    generateSummaries: options.generateSummaries !== false,
    minEntityConfidence: options.minEntityConfidence ?? 0.3,
    maxEntitiesPerUnit: options.maxEntitiesPerUnit ?? 10,
    chunkOverlap: options.chunkOverlap ?? 0.1
  }

  // Initialize RDF infrastructure
  const namespaceManager = new NamespaceManager()
  const rdfManager = new RDFGraphManager({ namespace: namespaceManager })
  const dataset = rdf.dataset()

  // Collections for results
  const units = []
  const entitiesMap = new Map() // name -> Entity
  const relationships = []
  const unitEntityConnections = [] // Track unit-entity connections

  try {
    // Phase 1: Process each chunk into semantic units
    for (let chunkIndex = 0; chunkIndex < textChunks.length; chunkIndex++) {
      const chunk = textChunks[chunkIndex]
      logger.debug(`Processing chunk ${chunkIndex + 1}/${textChunks.length}`)

      // Extract semantic units from chunk using LLM
      const unitTexts = await extractSemanticUnits(chunk.content, llmHandler, opts)

      // Create SemanticUnit objects
      for (let unitIndex = 0; unitIndex < unitTexts.length; unitIndex++) {
        const unitText = unitTexts[unitIndex]
        const unitId = `unit_${chunkIndex}_${unitIndex}`

        // Generate summary if requested
        let summary = ''
        if (opts.generateSummaries && unitText.length > 100) {
          summary = await generateUnitSummary(unitText, llmHandler)
        }

        // Create RDF-based SemanticUnit
        const unit = new SemanticUnit({
          dataset: rdf.dataset(),
          text: unitText,
          summary: summary,
          source: chunk.source,
          position: Math.max(0, chunk.content.indexOf(unitText)), // Start offset in chunk (0 if the LLM paraphrased)
          length: unitText.length // Content length
        })

        units.push(unit)

        // Add to RDF dataset
        unit.exportToDataset(dataset)

        // Extract entities from this unit
        const unitEntities = await extractEntitiesFromUnit(unitText, llmHandler, opts)

        // Process entities and create connections
        for (const entityData of unitEntities) {
          let entity = entitiesMap.get(entityData.name)

          if (!entity) {
            // Create new Entity
            entity = new Entity({
              name: entityData.name,
              isEntryPoint: entityData.isEntryPoint || false,
              subType: entityData.type || 'general',
              confidence: entityData.confidence ?? 1.0,
              alternativeLabels: entityData.alternatives || [],
              source: chunk.source
            })

            entitiesMap.set(entityData.name, entity)
            entity.exportToDataset(dataset)
          } else {
            // Update existing entity
            entity.incrementFrequency()
            entity.addSource(chunk.source)
          }

          // Create unit-entity connection
          unit.addEntityMention(entity.getURI(), entityData.relevance ?? 1.0)
          unitEntityConnections.push({
            unit: unit,
            entity: entity,
            relevance: entityData.relevance ?? 1.0,
            context: unitText
          })
        }

        logger.debug(`Unit ${unitId}: ${unitEntities.length} entities extracted`)
      }
    }

    // Phase 2: Extract relationships between entities
    if (opts.extractRelationships && entitiesMap.size > 1) {
      logger.info('Phase 2: Extracting relationships between entities...')

      const entityList = Array.from(entitiesMap.values())
      const relationshipData = await extractRelationships(entityList, units, llmHandler, opts)

      for (const relData of relationshipData) {
        const sourceEntity = entitiesMap.get(relData.source)
        const targetEntity = entitiesMap.get(relData.target)

        if (sourceEntity && targetEntity) {
          const relationship = new Relationship({
            id: `rel_${relationships.length}`,
            sourceEntity: sourceEntity.getURI(),
            targetEntity: targetEntity.getURI(),
            relationshipType: relData.type || 'related',
            content: relData.content || '',
            weight: relData.weight ?? 0.5,
            evidence: relData.evidence || [],
            bidirectional: relData.bidirectional || false
          })

          relationships.push(relationship)
          relationship.exportToDataset(dataset)

          // Add relationship triples using RDF graph manager
          const relUri = relationship.getURI()
          const relNode = rdf.namedNode(relUri)
          
          // Add relationship type
          rdfManager.addTriple(relNode, rdfManager.ns.rdf.type, rdfManager.ns.ragno.Relationship)
          
          // Connect relationship to source and target entities
          rdfManager.addTriple(rdf.namedNode(sourceEntity.getURI()), rdfManager.ns.ragno.hasRelationship, relNode)
          rdfManager.addTriple(relNode, rdfManager.ns.ragno.hasSource, rdf.namedNode(sourceEntity.getURI()))
          rdfManager.addTriple(relNode, rdfManager.ns.ragno.hasTarget, rdf.namedNode(targetEntity.getURI()))
          
          // Add inverse relationship if bidirectional
          if (relData.bidirectional) {
            rdfManager.addTriple(rdf.namedNode(targetEntity.getURI()), rdfManager.ns.ragno.hasRelationship, relNode)
          }
        }
      }

      logger.info(`Created ${relationships.length} relationships`)
    }

    // Phase 3: Create inter-unit relationships for coherence
    await createInterUnitRelationships(units, dataset, rdfManager)

    const processingTime = Date.now() - startTime
    logger.info(`Corpus decomposition completed in ${processingTime}ms: ${units.length} units, ${entitiesMap.size} entities, ${relationships.length} relationships`)

    return {
      units,
      entities: Array.from(entitiesMap.values()),
      relationships,
      dataset,
      connections: unitEntityConnections,
      statistics: {
        processingTime,
        totalChunks: textChunks.length,
        totalUnits: units.length,
        totalEntities: entitiesMap.size,
        totalRelationships: relationships.length,
        averageEntitiesPerUnit: units.length > 0 ? entitiesMap.size / units.length : 0
      }
    }

  } catch (error) {
    logger.error('Corpus decomposition failed:', error)
    throw error
  }
}

/**
 * Extract semantic units from text using LLM
 * @param {string} text - Input text
 * @param {Object} llmHandler - LLM handler
 * @param {Object} options - Extraction options
 * @returns {Promise<Array<string>>} Array of semantic unit texts
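 *
 * @example
 * // Sketch; the actual units depend on the model's segmentation
 * const units = await extractSemanticUnits('It rained. The match was cancelled.', llmHandler)
 * // e.g. ['It rained.', 'The match was cancelled.']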
 */
async function extractSemanticUnits(text, llmHandler, options = {}) {
  const prompt = `Break down the following text into independent semantic units. Each unit should represent a complete thought, event, or concept that can stand alone. Return as a JSON array of strings.

Text: "${text}"

Return format: ["unit1", "unit2", "unit3"]

Semantic units:`

  try {
    const response = await llmHandler.generateResponse(prompt, '', {
      max_tokens: 1000,
      temperature: 0.1
    })

    // Parse LLM response using ParseHelper
    const cleanedResponse = ParseHelper.resolveSyntax(response)
    if (cleanedResponse === false) {
      logger.warn('ParseHelper could not resolve syntax, using original response')
      const units = JSON.parse(response.trim())
      return Array.isArray(units) ? units : [text]
    }
    
    const units = JSON.parse(cleanedResponse)
    return Array.isArray(units) ? units : [text] // Fallback to original text

  } catch (error) {
    logger.warn('LLM unit extraction failed, using sentence splitting fallback:', error.message)

    // Fallback: simple sentence splitting
    return text.split(/[.!?]+/)
      .map(sentence => sentence.trim())
      .filter(sentence => sentence.length > 10) // Filter out very short sentences
  }
}

/**
 * Generate summary for a semantic unit
 * @param {string} unitText - Unit text content
 * @param {Object} llmHandler - LLM handler
 * @returns {Promise<string>} Generated summary
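 *
 * @example
 * // Sketch; the summary wording depends on the underlying model
 * const summary = await generateUnitSummary(unitText, llmHandler)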
 */
async function generateUnitSummary(unitText, llmHandler) {
  const prompt = `Provide a concise 1-2 sentence summary of the key concept or event in this text:

"${unitText}"

Summary:`

  try {
    const summary = await llmHandler.generateResponse(prompt, '', {
      max_tokens: 100,
      temperature: 0.1
    })

    return summary.trim()

  } catch (error) {
    logger.warn('Summary generation failed:', error.message)
    return unitText.length > 100 ? unitText.substring(0, 100) + '...' : unitText
  }
}

/**
 * Extract entities from a semantic unit
 * @param {string} unitText - Unit text content
 * @param {Object} llmHandler - LLM handler
 * @param {Object} options - Extraction options
 * @returns {Promise<Array<Object>>} Array of entity data objects
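 *
 * @example
 * // Sketch; the entity list depends on the model (opts as built in decomposeCorpus)
 * const entities = await extractEntitiesFromUnit('Marie Curie studied in Paris.', llmHandler, opts)
 * // e.g. [{ name: 'Marie Curie', type: 'person', relevance: 0.9, isEntryPoint: true, confidence: 0.9 }]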
 */
async function extractEntitiesFromUnit(unitText, llmHandler, options = {}) {
  const prompt = `Extract the key entities (people, places, organizations, concepts) from this text. For each entity, provide name, type, relevance score (0-1), and whether it's an entry point (important/central entity).

Text: "${unitText}"

Return as JSON array:
[{"name": "entity1", "type": "person", "relevance": 0.9, "isEntryPoint": true, "confidence": 0.8}]

Entities:`

  try {
    const systemPrompt = "You are a helpful assistant that extracts entities from text. Return entities as a JSON array with name, type, relevance, isEntryPoint, and confidence.";
    const response = await llmHandler.generateResponse(prompt, '', {
      systemPrompt,
      max_tokens: 500,
      temperature: 0.1
    });

    // Use ParseHelper to clean the response
    const cleanedResponse = ParseHelper.resolveSyntax(response)
    if (cleanedResponse === false) {
      logger.warn('ParseHelper could not resolve syntax for entity extraction')
      throw new Error('Failed to parse entity extraction response')
    }

    const entities = JSON.parse(cleanedResponse)

    // Filter and validate entities (fall back to the decomposeCorpus defaults when called directly)
    const minConfidence = options.minEntityConfidence ?? 0.3
    const maxEntities = options.maxEntitiesPerUnit ?? 10
    return Array.isArray(entities) ? entities.filter(entity =>
      entity.name &&
      entity.name.length > 1 &&
      (entity.confidence ?? 1.0) >= minConfidence
    ).slice(0, maxEntities) : []

  } catch (error) {
    logger.warn('Entity extraction failed, using fallback:', error.message)

    // Fallback: extract capitalized words as potential entities
    const words = unitText.match(/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/g) || []
    return words.slice(0, options.maxEntitiesPerUnit ?? 10).map(name => ({
      name: name,
      type: 'general',
      relevance: 0.5,
      isEntryPoint: false,
      confidence: 0.5
    }))
  }
}

/**
 * Extract relationships between entities
 * @param {Array<Entity>} entities - List of entities
 * @param {Array<SemanticUnit>} units - List of semantic units
 * @param {Object} llmHandler - LLM handler
 * @param {Object} options - Extraction options
 * @returns {Promise<Array<Object>>} Array of relationship data objects
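 *
 * @example
 * // Sketch; assumes entities and units produced earlier by decomposeCorpus
 * const rels = await extractRelationships(entityList, units, llmHandler, opts)
 * // e.g. [{ source: 'Marie Curie', target: 'Paris', type: 'studied_in', weight: 0.8 }]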
 */
async function extractRelationships(entities, units, llmHandler, options = {}) {
  const relationships = []
  const entityNames = entities
    .map(e => (e.getPreferredLabel ? e.getPreferredLabel() : ''))
    .filter(name => name.length > 0) // An empty label would match every unit via includes('')

  // Process units that contain multiple entities
  for (const unit of units) {
    const unitEntityNames = entityNames.filter(name =>
      unit.getContent().toLowerCase().includes(name.toLowerCase())
    )

    if (unitEntityNames.length < 2) continue

    const prompt = `Identify relationships between these entities in the given text. Return relationships as JSON array with source, target, type, content, and weight (0-1).

Entities: [${unitEntityNames.map(name => `"${name}"`).join(', ')}]
Text: "${unit.getContent()}"

Return format:
[{"source": "Entity1", "target": "Entity2", "type": "collaborates_with", "content": "relationship description", "weight": 0.8}]

Relationships:`

    try {
      const systemPrompt = "You are a helpful assistant that identifies relationships between entities in text. Return relationships as a JSON array with source, target, type, content, and weight (0-1).";
      const response = await llmHandler.generateResponse(prompt, '', {
        systemPrompt,
        max_tokens: 300,
        temperature: 0.1
      });

      // Use ParseHelper to clean the response
      const cleanedResponse = ParseHelper.resolveSyntax(response)
      if (cleanedResponse === false) {
        logger.warn('ParseHelper could not resolve syntax for relationship extraction')
        continue // Skip this unit and continue with the next one
      }

      const unitRelationships = JSON.parse(cleanedResponse)

      if (Array.isArray(unitRelationships)) {
        for (const rel of unitRelationships) {
          if (rel.source && rel.target && rel.source !== rel.target) {
            relationships.push({
              ...rel,
              evidence: [unit.getURI()],
              bidirectional: rel.bidirectional || false
            })
          }
        }
      }

    } catch (error) {
      logger.warn(`Relationship extraction failed for unit: ${error.message}`)
    }
  }

  return relationships
}

/**
 * Create inter-unit relationships for coherence
 * @param {Array<SemanticUnit>} units - List of semantic units
 * @param {Dataset} dataset - RDF dataset
 * @param {RDFGraphManager} rdfManager - RDF graph manager
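 *
 * @example
 * // Called internally after unit extraction to link adjacent units sequentially
 * await createInterUnitRelationships(units, dataset, rdfManager)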
 */
async function createInterUnitRelationships(units, dataset, rdfManager) {
  logger.debug('Creating inter-unit relationships...')

  // Create simple sequential relationships between adjacent units
  for (let i = 0; i < units.length - 1; i++) {
    const currentUnit = units[i]
    const nextUnit = units[i + 1]

    // Create a "follows" relationship
    const relationshipId = `unit_rel_${i}`
    const relationship = new Relationship({
      id: relationshipId,
      sourceEntity: currentUnit.getURI(),
      targetEntity: nextUnit.getURI(),
      relationshipType: 'follows',
      content: 'Sequential narrative flow',
      weight: 0.3,
      bidirectional: false
    })

    relationship.exportToDataset(dataset)
  }

  logger.debug(`Created ${Math.max(0, units.length - 1)} inter-unit relationships`)
}

/**
 * Export decomposition results to SPARQL endpoint
 * @param {Object} decompositionResults - Results from decomposeCorpus
 * @param {string} endpoint - SPARQL endpoint URL
 * @param {Object} [auth] - Authentication credentials
 * @returns {Promise<Object>} Export statistics
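 *
 * @example
 * // Sketch with a hypothetical endpoint URL; the auth shape depends on SPARQLHelpers
 * const results = await decomposeCorpus(chunks, llmHandler)
 * const stats = await exportToRDF(results, 'http://localhost:3030/dataset/update')
 * console.log(`Exported ${stats.triplesExported} triples in ${stats.exportTime}ms`)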
 */
export async function exportToRDF(decompositionResults, endpoint, auth = null) {
  const { dataset, statistics } = decompositionResults
  const startTime = Date.now()

  logger.info(`Exporting decomposition results to SPARQL endpoint: ${endpoint}`)

  try {
    // Convert dataset to N-Triples for SPARQL insertion
    // (`require` is unavailable in an ES module, so load the serializer via dynamic import)
    const { default: SerializerNtriples } = await import('@rdfjs/serializer-ntriples')
    const serializer = new SerializerNtriples()
    const ntriplesStream = serializer.import(dataset.toStream())

    let ntriplesData = ''
    for await (const chunk of ntriplesStream) {
      ntriplesData += chunk
    }

    // Insert all triples in a single update (very large graphs may need batching)
    const insertQuery = `INSERT DATA { ${ntriplesData} }`

    await SPARQLHelpers.executeSPARQLUpdate(endpoint, insertQuery, auth)

    const exportTime = Date.now() - startTime
    logger.info(`Export completed in ${exportTime}ms`)

    return {
      success: true,
      exportTime,
      triplesExported: dataset.size,
      originalStatistics: statistics,
      endpoint
    }

  } catch (error) {
    logger.error('RDF export failed:', error)
    throw error
  }
}