// Source: ragno/augmentWithAttributes.js

/**
 * Ragno: Node Importance-Based Augmentation - RDF-Ext Version
 * 
 * This module selects important entities and generates comprehensive attribute
 * summaries using advanced graph algorithms and LLM analysis. It integrates
 * with the ragno search system to create rich, searchable entity profiles.
 */

import rdf from 'rdf-ext'
import Attribute from './Attribute.js'
import RDFGraphManager from './core/RDFGraphManager.js'
import NamespaceManager from './core/NamespaceManager.js'
import { GraphAnalytics } from './algorithms/index.js'
import { logger } from '../Utils.js'

/**
 * Augment entities with comprehensive attributes using graph analysis and LLM.
 *
 * Pipeline: (1) rank entities by graph importance, (2) generate several
 * attribute types per selected entity via the LLM, (3) cross-link the
 * generated attributes in the result RDF dataset.
 *
 * @param {Object} graphData - Decomposition results with RDF dataset
 * @param {Object} llmHandler - LLM handler instance
 * @param {Object} [options] - Augmentation options
 * @returns {Promise<{attributes: Attribute[], dataset: Dataset, statistics: Object}>}
 * @throws Rethrows any error raised outside the per-attribute generation loop
 */
export async function augmentWithAttributes(graphData, llmHandler, options = {}) {
  const startTime = Date.now()
  logger.info('Starting entity attribute augmentation...')
  
  // Spread the caller's options FIRST so the defaults below take precedence
  // over explicitly-undefined keys. Use ?? rather than || so legitimate falsy
  // values (e.g. minImportanceScore: 0, confidenceThreshold: 0) are honored
  // instead of being silently replaced by the defaults.
  const opts = {
    ...options,
    
    // Selection criteria
    topK: options.topK ?? 10,
    importanceMethod: options.importanceMethod ?? 'hybrid', // 'degree', 'kcore', 'centrality', 'hybrid'
    minImportanceScore: options.minImportanceScore ?? 0.1,
    
    // Attribute generation
    attributeTypes: options.attributeTypes ?? [
      'overview', 'characteristics', 'relationships', 'context', 'significance'
    ],
    maxContextLength: options.maxContextLength ?? 2000,
    includeEvidence: options.includeEvidence !== false,
    
    // Quality control
    minAttributeLength: options.minAttributeLength ?? 50,
    maxAttributeLength: options.maxAttributeLength ?? 500,
    confidenceThreshold: options.confidenceThreshold ?? 0.3
  }
  
  // Initialize RDF infrastructure
  const namespaceManager = new NamespaceManager()
  const rdfManager = new RDFGraphManager({ namespace: namespaceManager })
  const resultDataset = rdf.dataset()
  
  // Work on a copy of the input dataset so the caller's dataset is not mutated
  if (graphData.dataset) {
    for (const quad of graphData.dataset) {
      resultDataset.add(quad)
    }
  }
  
  try {
    // Phase 1: Analyze graph structure to identify important entities
    const importantEntities = await identifyImportantEntities(
      graphData, 
      opts.importanceMethod, 
      opts.topK,
      opts.minImportanceScore
    )
    
    logger.info(`Selected ${importantEntities.length} important entities for augmentation`)
    
    // Phase 2: Generate comprehensive attributes for each important entity
    const attributes = []
    const attributeStats = {
      totalGenerated: 0,
      byType: new Map(),    // attributeType -> count
      averageLength: 0,     // running mean of content length
      averageConfidence: 0  // running mean of confidence
    }
    
    for (const entityData of importantEntities) {
      logger.debug(`Augmenting entity: ${entityData.entity.getPreferredLabel()}`)
      
      // Gather context (units, relationships, evidence) once per entity
      const entityContext = await gatherEntityContext(entityData.entity, graphData, opts)
      
      // Generate each configured attribute type independently; one failure
      // must not abort the remaining types or entities.
      for (const attributeType of opts.attributeTypes) {
        try {
          const attributeData = await generateEntityAttribute(
            entityData.entity,
            entityContext,
            attributeType,
            llmHandler,
            opts
          )
          
          // Skip null results and content below the minimum length gate
          if (attributeData && attributeData.content.length >= opts.minAttributeLength) {
            // Create RDF-based Attribute
            const attribute = new Attribute(rdfManager, {
              id: `attr_${entityData.entity.getPreferredLabel()}_${attributeType}_${attributes.length}`,
              entity: entityData.entity.getURI(),
              category: attributeType,
              content: attributeData.content,
              confidence: attributeData.confidence,
              keywords: attributeData.keywords || [],
              evidence: attributeData.evidence || [],
              temporal: attributeData.temporal || null,
              provenance: `LLM-generated ${attributeType} attribute`
            })
            
            attributes.push(attribute)
            attribute.exportToDataset(resultDataset)
            
            // Update statistics via incremental running means
            attributeStats.totalGenerated++
            const typeCount = attributeStats.byType.get(attributeType) || 0
            attributeStats.byType.set(attributeType, typeCount + 1)
            attributeStats.averageLength = (attributeStats.averageLength * (attributes.length - 1) + attributeData.content.length) / attributes.length
            attributeStats.averageConfidence = (attributeStats.averageConfidence * (attributes.length - 1) + attributeData.confidence) / attributes.length
            
            logger.debug(`Generated ${attributeType} attribute: ${attributeData.content.length} chars, confidence: ${attributeData.confidence}`)
          }
          
        } catch (error) {
          // Best-effort per attribute: log and continue with the next type
          logger.warn(`Failed to generate ${attributeType} attribute for ${entityData.entity.getPreferredLabel()}:`, error.message)
        }
      }
    }
    
    // Phase 3: Create cross-attribute relationships and insights
    await createAttributeRelationships(attributes, resultDataset, rdfManager)
    
    const processingTime = Date.now() - startTime
    logger.info(`Attribute augmentation completed in ${processingTime}ms: ${attributes.length} attributes generated`)
    
    return {
      attributes,
      dataset: resultDataset,
      statistics: {
        processingTime,
        entitiesProcessed: importantEntities.length,
        attributesGenerated: attributes.length,
        attributeStats,
        originalStats: graphData.statistics
      }
    }
    
  } catch (error) {
    logger.error('Attribute augmentation failed:', error)
    throw error
  }
}

/**
 * Identify important entities using graph analysis algorithms.
 *
 * Combines degree, k-core and betweenness-centrality scores (each normalized
 * to [0, 1]) into a composite score per the requested method, then returns
 * the topK entities above the minimum threshold. Falls back to a simple
 * relationship-count heuristic if graph analysis fails.
 *
 * @param {Object} graphData - Graph data with entities and relationships
 * @param {string} method - Importance calculation method ('degree' | 'kcore' | 'centrality' | 'hybrid')
 * @param {number} topK - Number of top entities to select
 * @param {number} minScore - Minimum importance score threshold
 * @returns {Promise<Array>} Array of important entity data objects
 */
async function identifyImportantEntities(graphData, method, topK, minScore) {
  logger.debug(`Analyzing entity importance using method: ${method}`)
  
  const { entities, dataset } = graphData
  const entityScores = new Map()
  
  // Initialize per-entity score record; all components default to 0
  for (const entity of entities) {
    entityScores.set(entity.getURI(), {
      entity: entity,
      degreeScore: 0,
      kcoreScore: 0,
      centralityScore: 0,
      compositeScore: 0
    })
  }
  
  try {
    // Run graph algorithms for importance calculation
    const graphAnalytics = new GraphAnalytics()
    const graph = graphAnalytics.buildGraphFromRDF(dataset)
    
    if (graph.nodes.size === 0) {
      logger.warn('Empty graph for importance analysis')
      // No structure to analyze: return the first topK entities with a neutral score
      return entities.slice(0, topK).map(entity => ({ entity, importance: 0.5 }))
    }
    
    // Calculate degree-based importance.
    // Clamp the divisor to >= 1: Math.max over an empty distribution is
    // -Infinity, and an all-zero-degree graph would otherwise divide by zero.
    const degreeStats = graphAnalytics.computeGraphStatistics(graph)
    const maxDegree = Math.max(1, ...degreeStats.degreeDistribution.values())
    
    for (const [nodeUri, degree] of degreeStats.degreeDistribution) {
      if (entityScores.has(nodeUri)) {
        entityScores.get(nodeUri).degreeScore = degree / maxDegree
      }
    }
    
    // Calculate k-core based importance if graph is large enough.
    // Seeding Math.max with 0 keeps maxCore finite even if coreNumbers is empty.
    if (graph.nodes.size > 2) {
      const kcoreResults = graphAnalytics.computeKCore(graph)
      const maxCore = Math.max(0, ...kcoreResults.coreNumbers.values())
      
      if (maxCore > 0) {
        for (const [nodeUri, coreNumber] of kcoreResults.coreNumbers) {
          if (entityScores.has(nodeUri)) {
            entityScores.get(nodeUri).kcoreScore = coreNumber / maxCore
          }
        }
      }
    }
    
    // Calculate betweenness centrality only for smaller graphs (it is expensive)
    if (graph.nodes.size <= 500) {
      const centralityResults = graphAnalytics.computeBetweennessCentrality(graph)
      const maxCentrality = Math.max(0, ...centralityResults.centrality.values())
      
      if (maxCentrality > 0) {
        for (const [nodeUri, centrality] of centralityResults.centrality) {
          if (entityScores.has(nodeUri)) {
            entityScores.get(nodeUri).centralityScore = centrality / maxCentrality
          }
        }
      }
    }
    
    // Combine the component scores according to the requested method
    for (const [uri, scores] of entityScores) {
      switch (method) {
        case 'degree':
          scores.compositeScore = scores.degreeScore
          break
        case 'kcore':
          scores.compositeScore = scores.kcoreScore
          break
        case 'centrality':
          scores.compositeScore = scores.centralityScore
          break
        case 'hybrid':
        default:
          // Weighted blend: degree and k-core dominate, centrality refines
          scores.compositeScore = (
            scores.degreeScore * 0.4 +
            scores.kcoreScore * 0.4 +
            scores.centralityScore * 0.2
          )
          break
      }
    }
    
    // Sort by composite score (descending), filter by threshold, keep topK
    const importantEntities = Array.from(entityScores.values())
      .filter(data => data.compositeScore >= minScore)
      .sort((a, b) => b.compositeScore - a.compositeScore)
      .slice(0, topK)
      .map(data => ({
        entity: data.entity,
        importance: data.compositeScore,
        degreeScore: data.degreeScore,
        kcoreScore: data.kcoreScore,
        centralityScore: data.centralityScore
      }))
    
    logger.debug(`Selected ${importantEntities.length} entities with scores >= ${minScore}`)
    return importantEntities
    
  } catch (error) {
    logger.warn('Graph analysis failed, using fallback degree calculation:', error.message)
    
    // Fallback: normalized relationship count as the importance proxy
    const connections = new Map()
    
    for (const entity of entities) {
      connections.set(entity.getURI(), 0)
    }
    
    // Count connections from relationships
    if (graphData.relationships) {
      for (const relationship of graphData.relationships) {
        const sourceUri = relationship.getSourceEntity()
        const targetUri = relationship.getTargetEntity()
        
        if (connections.has(sourceUri)) {
          connections.set(sourceUri, connections.get(sourceUri) + 1)
        }
        if (connections.has(targetUri)) {
          connections.set(targetUri, connections.get(targetUri) + 1)
        }
      }
    }
    
    // The trailing 1 guards against division by zero when nothing is connected
    const maxConnections = Math.max(...Array.from(connections.values()), 1)
    
    return entities
      .map(entity => ({
        entity: entity,
        importance: connections.get(entity.getURI()) / maxConnections
      }))
      .filter(data => data.importance >= minScore)
      .sort((a, b) => b.importance - a.importance)
      .slice(0, topK)
  }
}

/**
 * Gather comprehensive context for an entity: the semantic units that mention
 * it, the relationships that touch it, the entities on the other end of those
 * relationships, a bounded text excerpt, and evidence URIs for provenance.
 * @param {Entity} entity - Entity to gather context for
 * @param {Object} graphData - Graph data (may carry units and relationships)
 * @param {Object} options - Context gathering options (uses maxContextLength)
 * @returns {Promise<Object>} Entity context object
 */
async function gatherEntityContext(entity, graphData, options) {
  const entityUri = entity.getURI()
  const labelLower = entity.getPreferredLabel().toLowerCase()
  
  const context = {
    entity: entity,
    units: [],
    relationships: [],
    relatedEntities: new Set(),
    contextText: '',
    evidence: []
  }
  
  // Collect every semantic unit that mentions the entity, either via a
  // case-insensitive label substring or an explicit entity mention.
  if (graphData.units) {
    for (const unit of graphData.units) {
      const mentionsEntity =
        unit.getContent().toLowerCase().includes(labelLower) ||
        unit.hasEntityMention(entityUri)
      if (!mentionsEntity) continue
      
      context.units.push(unit)
      context.evidence.push(unit.getURI())
      
      // Stop appending once the budget is reached; the final trim below
      // caps any overshoot from the last appended unit.
      if (context.contextText.length < options.maxContextLength) {
        context.contextText += unit.getContent() + '\n\n'
      }
    }
  }
  
  // Record each relationship touching the entity plus its counterpart entity.
  if (graphData.relationships) {
    for (const rel of graphData.relationships) {
      const source = rel.getSourceEntity()
      const target = rel.getTargetEntity()
      if (source !== entityUri && target !== entityUri) continue
      
      context.relationships.push(rel)
      context.relatedEntities.add(source === entityUri ? target : source)
      context.evidence.push(rel.getURI())
    }
  }
  
  // Hard-cap the accumulated context text with an ellipsis marker.
  if (context.contextText.length > options.maxContextLength) {
    context.contextText = context.contextText.substring(0, options.maxContextLength) + '...'
  }
  
  logger.debug(`Gathered context for ${entity.getPreferredLabel()}: ${context.units.length} units, ${context.relationships.length} relationships`)
  
  return context
}

/**
 * Generate a specific type of attribute for an entity via the LLM.
 * @param {Entity} entity - Entity to generate attribute for
 * @param {Object} context - Entity context (as built by gatherEntityContext)
 * @param {string} attributeType - Type of attribute to generate; one of
 *   'overview', 'characteristics', 'relationships', 'context', 'significance'.
 *   Unknown types fall back to the 'overview' prompt.
 * @param {Object} llmHandler - LLM handler
 * @param {Object} options - Generation options (uses minAttributeLength)
 * @returns {Promise<Object|null>} Generated attribute data, or null when the
 *   LLM call fails or the content is shorter than options.minAttributeLength
 */
async function generateEntityAttribute(entity, context, attributeType, llmHandler, options) {
  const entityName = entity.getPreferredLabel()
  const relatedEntities = Array.from(context.relatedEntities).slice(0, 5) // Limit for context
  
  // Build type-specific prompts; each asks for a short 2-3 sentence answer
  const prompts = {
    overview: `Provide a comprehensive overview of ${entityName} based on the following information. Focus on who/what they are, their primary role or significance, and key characteristics.

Context: ${context.contextText}

Related entities: ${relatedEntities.join(', ')}

Write a 2-3 sentence overview:`,

    characteristics: `Describe the key characteristics, traits, and distinctive features of ${entityName} based on the provided context.

Context: ${context.contextText}

List the main characteristics in 2-3 sentences:`,

    relationships: `Summarize the key relationships and connections of ${entityName} with other entities, based on the provided information.

Context: ${context.contextText}
Related entities: ${relatedEntities.join(', ')}

Describe the main relationships in 2-3 sentences:`,

    context: `Explain the broader context, setting, or environment in which ${entityName} operates or exists.

Context: ${context.contextText}

Describe the context in 2-3 sentences:`,

    significance: `Analyze the importance and significance of ${entityName} within the broader narrative or domain.

Context: ${context.contextText}

Explain the significance in 2-3 sentences:`
  }
  
  // Unknown attribute types fall back to the overview prompt
  const prompt = prompts[attributeType] || prompts.overview
  
  try {
    // Low temperature for focused, factual summaries; short token budget
    const response = await llmHandler.generateResponse(prompt, '', {
      maxTokens: 200,
      temperature: 0.1
    })
    
    const content = response.trim()
    
    // Quality gate: reject content below the configured minimum length
    if (content.length < options.minAttributeLength) {
      logger.debug(`Generated ${attributeType} attribute too short: ${content.length} chars`)
      return null
    }
    
    // Extract keywords from the generated content
    const keywords = extractKeywords(content)
    
    // Calculate confidence based on context quality
    const confidence = calculateAttributeConfidence(context, content, options)
    
    return {
      content: content,
      confidence: confidence,
      keywords: keywords,
      evidence: context.evidence,
      temporal: null // Could be enhanced with temporal extraction
    }
    
  } catch (error) {
    // Best-effort: callers treat null as "no attribute for this type"
    logger.warn(`Failed to generate ${attributeType} attribute for ${entityName}:`, error.message)
    return null
  }
}

/**
 * Extract up to five keywords from attribute content by frequency.
 * Tokens shorter than four characters and stop words are excluded.
 * @param {string} content - Attribute content
 * @returns {Array<string>} Extracted keywords, most frequent first
 */
function extractKeywords(content) {
  // Tokenize: lowercase, strip punctuation, drop short tokens and stop words
  const tokens = content.toLowerCase()
    .replace(/[^\w\s]/g, ' ')
    .split(/\s+/)
    .filter((token) => token.length > 3 && !isStopWord(token))
  
  // Tally token frequencies
  const counts = new Map()
  tokens.forEach((token) => {
    counts.set(token, (counts.get(token) ?? 0) + 1)
  })
  
  // Top five by count (stable sort preserves first-seen order on ties)
  return [...counts.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5)
    .map((entry) => entry[0])
}

// Module-level constant: built once instead of reallocating the Set on
// every isStopWord call (the original rebuilt it per invocation).
const STOP_WORDS = new Set([
  'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
  'this', 'that', 'these', 'those', 'is', 'are', 'was', 'were', 'be', 'been',
  'have', 'has', 'had', 'will', 'would', 'could', 'should', 'may', 'might'
])

/**
 * Check if word is a stop word.
 * @param {string} word - Word to check (expected lowercase; matching is case-sensitive)
 * @returns {boolean} True if stop word
 */
function isStopWord(word) {
  return STOP_WORDS.has(word)
}

/**
 * Calculate a confidence score for a generated attribute.
 * Starts from a 0.5 baseline and adds capped bonuses for supporting units
 * (up to +0.3), relationships (up to +0.2), substantial content (+0.1 when
 * over twice the minimum length), and multiple evidence references (+0.1).
 * @param {Object} context - Entity context
 * @param {string} content - Generated content
 * @param {Object} options - Calculation options (uses minAttributeLength)
 * @returns {number} Confidence score 0-1
 */
function calculateAttributeConfidence(context, content, options) {
  const bonuses = [
    // Context quality: each unit adds 0.1, capped at 0.3
    context.units.length > 0 ? Math.min(context.units.length * 0.1, 0.3) : 0,
    // Each relationship adds 0.05, capped at 0.2
    context.relationships.length > 0 ? Math.min(context.relationships.length * 0.05, 0.2) : 0,
    // Content quality: reward text well above the minimum length
    content.length > options.minAttributeLength * 2 ? 0.1 : 0,
    // Evidence: reward more than two supporting references
    context.evidence.length > 2 ? 0.1 : 0
  ]
  
  // Sum from the 0.5 baseline in the same order the bonuses are listed,
  // then clamp to the [*, 1.0] ceiling.
  const total = bonuses.reduce((sum, bonus) => sum + bonus, 0.5)
  return Math.min(total, 1.0)
}

/**
 * Create "complements" relationships between attributes of the same entity
 * for cross-referencing, exporting each into the dataset.
 * @param {Array<Attribute>} attributes - Generated attributes
 * @param {Dataset} dataset - RDF dataset to export relationships into
 * @param {RDFGraphManager} rdfManager - RDF manager
 */
async function createAttributeRelationships(attributes, dataset, rdfManager) {
  logger.debug('Creating cross-attribute relationships...')
  
  // Resolve the Relationship class once up front instead of awaiting the
  // same dynamic import inside the nested loops below.
  const { default: Relationship } = await import('./Relationship.js')
  
  // Group attributes by their owning entity
  const entityAttributes = new Map()
  for (const attribute of attributes) {
    const entityUri = attribute.getEntity()
    if (!entityAttributes.has(entityUri)) {
      entityAttributes.set(entityUri, [])
    }
    entityAttributes.get(entityUri).push(attribute)
  }
  
  // Pair every two attributes of the same entity with a bidirectional
  // "complements" relationship. A global counter keeps the generated IDs
  // unique across entities: the previous attr_rel_${i}_${j} scheme produced
  // identical IDs for different entities' attribute pairs at the same indices.
  let relationshipCount = 0
  for (const [entityUri, entityAttrs] of entityAttributes) {
    if (entityAttrs.length < 2) continue
    
    for (let i = 0; i < entityAttrs.length; i++) {
      for (let j = i + 1; j < entityAttrs.length; j++) {
        const attr1 = entityAttrs[i]
        const attr2 = entityAttrs[j]
        
        // Create complementary relationship
        const relationship = new Relationship(rdfManager, {
          id: `attr_rel_${relationshipCount++}`,
          sourceEntity: attr1.getURI(),
          targetEntity: attr2.getURI(),
          relationshipType: 'complements',
          content: `${attr1.getCategory()} complements ${attr2.getCategory()}`,
          weight: 0.5,
          bidirectional: true
        })
        
        relationship.exportToDataset(dataset)
      }
    }
  }
  
  logger.debug(`Created cross-attribute relationships for ${entityAttributes.size} entities`)
}