Source: ragno/algorithms/index.js

/**
 * algorithms/index.js - Ragno Graph Algorithms Suite
 * 
 * This module provides a comprehensive suite of graph algorithms optimized
 * for RDF-based knowledge graphs following the ragno ontology. It integrates
 * all the core algorithms needed for advanced graph analysis and semantic search.
 * 
 * Available Algorithms:
 * - K-core decomposition for node importance ranking
 * - Betweenness centrality for identifying bridge nodes
 * - Leiden clustering for community detection
 * - Personalized PageRank for semantic search traversal
 * - HyDE hypothetical document generation
 * - VSOM entity clustering
 * - Graph connectivity and statistical analysis
 * 
 * Usage:
 * ```javascript
 * import RagnoAlgorithms from './algorithms/index.js'
 * 
 * const algorithms = new RagnoAlgorithms()
 * const results = await algorithms.runFullAnalysis(rdfDataset)
 * ```
 */

import rdf from 'rdf-ext'
import namespace from '@rdfjs/namespace'
import GraphAnalytics from './GraphAnalytics.js'
import CommunityDetection from './CommunityDetection.js'
import PersonalizedPageRank from './PersonalizedPageRank.js'
import Hyde from './Hyde.js'
import VSOM from './VSOM.js'
import { logger } from '../../Utils.js'

export default class RagnoAlgorithms {
    constructor(options = {}) {
        this.options = {
            // Graph analytics options
            maxIterations: options.maxIterations || 1000,
            convergenceThreshold: options.convergenceThreshold || 1e-6,
            
            // Community detection options
            resolution: options.resolution || 1.0,
            minCommunitySize: options.minCommunitySize || 3,
            
            // PPR options
            alpha: options.alpha || 0.15,
            topKPerType: options.topKPerType || 5,
            shallowIterations: options.shallowIterations || 2,
            deepIterations: options.deepIterations || 10,
            
            // General options
            logProgress: options.logProgress || false,
            exportToRDF: options.exportToRDF || false,
            // Spread last so any explicit caller option takes precedence over the defaults above
            ...options
        }
        
        // Initialize algorithm modules
        this.graphAnalytics = new GraphAnalytics(this.options)
        this.communityDetection = new CommunityDetection(this.options)
        this.personalizedPageRank = new PersonalizedPageRank(this.options)
        this.hyde = new Hyde(this.options)
        this.vsom = new VSOM(this.options)
        
        this.stats = {
            lastFullAnalysis: null,
            analysisCount: 0,
            totalProcessingTime: 0
        }
        
        logger.info('RagnoAlgorithms suite initialized')
    }
    
    /**
     * Run complete graph analysis pipeline
     * @param {Dataset} dataset - RDF-Ext dataset
     * @param {Object} [options] - Analysis options
     * @returns {Promise<Object>} Complete analysis results
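     *
     * A minimal usage sketch; `dataset` is assumed to be an RDF-Ext dataset
     * already loaded by the caller.
     * @example
     * const algorithms = new RagnoAlgorithms({ logProgress: true })
     * const analysis = await algorithms.runFullAnalysis(dataset)
     * console.log(`Analyzed ${analysis.graph.nodeCount} nodes in ${analysis.processingTime}ms`)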
     */
    async runFullAnalysis(dataset, options = {}) {
        const startTime = Date.now()
        logger.info('Starting full Ragno graph analysis pipeline...')
        
        const opts = { ...this.options, ...options }
        const results = {
            timestamp: new Date(),
            options: opts,
            graph: null,
            statistics: null,
            kCore: null,
            centrality: null,
            communities: null,
            components: null,
            processingTime: 0
        }
        
        try {
            // Phase 1: Build graph representation from RDF
            logger.info('Phase 1: Building graph from RDF dataset...')
            const graph = this.graphAnalytics.buildGraphFromRDF(dataset, { undirected: true })
            results.graph = {
                nodeCount: graph.nodes.size,
                edgeCount: graph.edges.size,
                metadata: 'Graph built from RDF dataset'
            }
            
            if (graph.nodes.size === 0) {
                logger.warn('Empty graph - skipping analysis')
                return results
            }
            
            // Phase 2: Basic graph statistics
            logger.info('Phase 2: Computing graph statistics...')
            results.statistics = this.graphAnalytics.computeGraphStatistics(graph)
            
            // Phase 3: Structural analysis
            logger.info('Phase 3: Running structural analysis...')
            
            // K-core decomposition
            if (graph.nodes.size > 1) {
                results.kCore = this.graphAnalytics.computeKCore(graph)
            }
            
            // Betweenness centrality (skip for very large graphs)
            if (graph.nodes.size <= 1000) {
                results.centrality = this.graphAnalytics.computeBetweennessCentrality(graph)
            } else {
                logger.info('Skipping betweenness centrality for large graph (>1000 nodes)')
            }
            
            // Connected components
            results.components = this.graphAnalytics.findConnectedComponents(graph)
            
            // Phase 4: Community detection
            logger.info('Phase 4: Detecting communities...')
            if (graph.nodes.size > 2) {
                results.communities = this.communityDetection.detectCommunities(graph, opts)
            }
            
            // Phase 5: Export to RDF if requested
            if (opts.exportToRDF && opts.targetDataset) {
                logger.info('Phase 5: Exporting results to RDF...')
                this.exportAllResultsToRDF(results, opts.targetDataset)
            }
            
            const endTime = Date.now()
            results.processingTime = endTime - startTime
            
            // Update statistics
            this.stats.lastFullAnalysis = new Date()
            this.stats.analysisCount++
            this.stats.totalProcessingTime += results.processingTime
            
            logger.info(`Full analysis completed in ${results.processingTime}ms`)
            
            return results
            
        } catch (error) {
            logger.error('Error during full analysis:', error)
            throw error
        }
    }
    
    /**
     * Run semantic search using PPR
     * @param {Dataset} dataset - RDF-Ext dataset
     * @param {Array} queryEntities - Entity URIs to start search from
     * @param {Object} [options] - Search options
     * @returns {Promise<Object>} Search results with ranked nodes
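     *
     * A minimal usage sketch; the entity URI is an illustrative placeholder and
     * `dataset` is assumed to be a non-empty RDF-Ext dataset.
     * @example
     * const search = await algorithms.runSemanticSearch(dataset, [
     *     'http://example.org/entity/example-entity'
     * ], { shallow: true })
     * for (const node of search.rankedNodes) {
     *     console.log(node.nodeUri, node.metadata.connections)
     * }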
     */
    async runSemanticSearch(dataset, queryEntities, options = {}) {
        logger.info(`Running semantic search from ${queryEntities.length} entities...`)
        
        const opts = { ...this.options, ...options }
        
        // Build graph
        const graph = this.graphAnalytics.buildGraphFromRDF(dataset, { undirected: true })
        
        if (graph.nodes.size === 0) {
            logger.warn('Empty graph for semantic search')
            return { results: [], entryPoints: queryEntities }
        }
        
        // Run appropriate PPR based on options
        let pprResults
        if (opts.shallow) {
            pprResults = this.personalizedPageRank.runShallowPPR(graph, queryEntities, opts)
        } else if (opts.deep) {
            pprResults = this.personalizedPageRank.runDeepPPR(graph, queryEntities, opts)
        } else {
            pprResults = this.personalizedPageRank.runPPR(graph, queryEntities, opts)
        }
        
        // Enhance results with node metadata
        const enhancedResults = this.enhanceSearchResults(pprResults, graph, dataset)
        
        return enhancedResults
    }
    
    /**
     * Enhance search results with additional metadata
     * @param {Object} pprResults - PPR results
     * @param {Object} graph - Graph representation
     * @param {Dataset} dataset - Original RDF dataset
     * @returns {Object} Enhanced results
     */
    enhanceSearchResults(pprResults, graph, dataset) {
        const enhancedNodes = []
        
        for (const node of pprResults.rankedNodes) {
            const nodeData = graph.nodes.get(node.nodeUri)
            const enhanced = {
                ...node,
                metadata: {
                    type: nodeData?.type || 'unknown',
                    connections: graph.adjacency.get(node.nodeUri)?.size || 0
                }
            }
            
            // Count the node's RDF triples when node data exists; a null subject
            // in dataset.match() would match every triple, so guard against it
            enhanced.metadata.tripleCount = nodeData?.uri
                ? [...dataset.match(rdf.namedNode(nodeData.uri))].length
                : 0
            
            enhancedNodes.push(enhanced)
        }
        
        return {
            ...pprResults,
            rankedNodes: enhancedNodes,
            enhanced: true
        }
    }
    
    /**
     * Run targeted analysis for specific algorithms
     * @param {Dataset} dataset - RDF-Ext dataset
     * @param {Array} algorithms - Array of algorithm names
     * @param {Object} [options] - Analysis options
     * @returns {Promise<Object>} Targeted analysis results
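     *
     * A minimal usage sketch using algorithm names this method recognises;
     * `dataset` is assumed to be an RDF-Ext dataset.
     * @example
     * const targeted = await algorithms.runTargetedAnalysis(dataset, ['k-core', 'components', 'stats'])
     * console.log(targeted.graph.nodeCount, targeted.statistics)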
     */
    async runTargetedAnalysis(dataset, algorithms, options = {}) {
        logger.info(`Running targeted analysis: ${algorithms.join(', ')}`)
        
        const graph = this.graphAnalytics.buildGraphFromRDF(dataset, { undirected: true })
        const results = {
            timestamp: new Date(),
            algorithms: algorithms,
            graph: { nodeCount: graph.nodes.size, edgeCount: graph.edges.size }
        }
        
        for (const algorithm of algorithms) {
            switch (algorithm.toLowerCase()) {
                case 'k-core':
                case 'kcore':
                    results.kCore = this.graphAnalytics.computeKCore(graph)
                    break
                    
                case 'centrality':
                case 'betweenness':
                    if (graph.nodes.size <= 1000) {
                        results.centrality = this.graphAnalytics.computeBetweennessCentrality(graph)
                    }
                    break
                    
                case 'communities':
                case 'leiden':
                    if (graph.nodes.size > 2) {
                        results.communities = this.communityDetection.detectCommunities(graph, options)
                    }
                    break
                    
                case 'components':
                    results.components = this.graphAnalytics.findConnectedComponents(graph)
                    break
                    
                case 'statistics':
                case 'stats':
                    results.statistics = this.graphAnalytics.computeGraphStatistics(graph)
                    break
                    
                case 'hyde':
                case 'hypothetical':
                    // Hyde requires different parameters - would need LLM handler
                    logger.info('Hyde algorithm requires LLM handler - use runHydeGeneration method')
                    break
                    
                case 'vsom':
                case 'clustering':
                    // VSOM requires different parameters - would need entity data and embeddings
                    logger.info('VSOM algorithm requires entity data and embeddings - use runEntityClustering method')
                    break
                    
                default:
                    logger.warn(`Unknown algorithm: ${algorithm}`)
            }
        }
        
        return results
    }
    
    /**
     * Get top-k important nodes across all metrics
     * @param {Object} analysisResults - Results from runFullAnalysis
     * @param {number} [k=10] - Number of top nodes to return
     * @returns {Array<Object>} Top-k nodes with scores from different algorithms
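     *
     * A minimal usage sketch; `analysis` is assumed to come from runFullAnalysis
     * on a non-empty graph.
     * @example
     * const topNodes = algorithms.getTopKNodes(analysis, 5)
     * for (const { nodeUri, compositeScore } of topNodes) {
     *     console.log(nodeUri, compositeScore.toFixed(4))
     * }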
     */
    getTopKNodes(analysisResults, k = 10) {
        const nodeScores = new Map()
        
        // Collect scores from different algorithms
        if (analysisResults.kCore?.coreNumbers) {
            for (const [nodeUri, coreNumber] of analysisResults.kCore.coreNumbers) {
                if (!nodeScores.has(nodeUri)) {
                    nodeScores.set(nodeUri, {})
                }
                nodeScores.get(nodeUri).coreNumber = coreNumber
            }
        }
        
        if (analysisResults.centrality?.centrality) {
            for (const [nodeUri, centrality] of analysisResults.centrality.centrality) {
                if (!nodeScores.has(nodeUri)) {
                    nodeScores.set(nodeUri, {})
                }
                nodeScores.get(nodeUri).centrality = centrality
            }
        }
        
        // Calculate composite score
        const scoredNodes = []
        for (const [nodeUri, scores] of nodeScores) {
            const coreScore = scores.coreNumber || 0
            const centralityScore = scores.centrality || 0
            
            // Weighted composite score
            const compositeScore = coreScore * 0.6 + centralityScore * 0.4
            
            scoredNodes.push({
                nodeUri,
                compositeScore,
                coreNumber: coreScore,
                centrality: centralityScore
            })
        }
        
        // Sort by composite score and return top-k
        return scoredNodes
            .sort((a, b) => b.compositeScore - a.compositeScore)
            .slice(0, k)
    }
    
    /**
     * Export all analysis results to RDF
     * @param {Object} results - Analysis results
     * @param {Dataset} targetDataset - Target RDF dataset
     */
    exportAllResultsToRDF(results, targetDataset) {
        logger.info('Exporting all analysis results to RDF...')
        
        // Export individual algorithm results
        if (results.kCore) {
            this.graphAnalytics.exportResultsToRDF(results.kCore, targetDataset)
        }
        
        if (results.centrality) {
            this.graphAnalytics.exportResultsToRDF(results.centrality, targetDataset)
        }
        
        if (results.communities) {
            this.communityDetection.exportCommunitiesToRDF(results.communities, targetDataset)
        }
        
        logger.info('All results exported to RDF')
    }
    
    /**
     * Run HyDE hypothesis generation
     * @param {Array|string} inputs - Query strings or entity URIs
     * @param {Object} llmHandler - LLM handler instance
     * @param {Dataset} targetDataset - RDF dataset to augment
     * @param {Object} [options] - HyDE options
     * @returns {Promise<Object>} HyDE generation results
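     *
     * A minimal usage sketch; `llmHandler` and `dataset` are assumed to be
     * configured by the caller, and the query string is illustrative.
     * @example
     * const hydeResults = await algorithms.runHydeGeneration(
     *     ['How are the key entities in this corpus related?'],
     *     llmHandler,
     *     dataset
     * )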
     */
    async runHydeGeneration(inputs, llmHandler, targetDataset, options = {}) {
        logger.info('Running HyDE hypothesis generation...')
        
        const opts = { ...this.options, ...options }
        return await this.hyde.generateHypotheses(inputs, llmHandler, targetDataset, opts)
    }

    /**
     * Query hypothetical content from dataset
     * @param {Dataset} dataset - RDF dataset to query
     * @param {Object} [filters] - Query filters
     * @returns {Array} Hypothetical content matching filters
     */
    queryHypotheticalContent(dataset, filters = {}) {
        return this.hyde.queryHypotheticalContent(dataset, filters)
    }

    /**
     * Run entity clustering using VSOM
     * @param {Array} entities - Array of entities to cluster
     * @param {Object} embeddingHandler - Embedding handler for vector generation
     * @param {Object} [options] - VSOM options
     * @returns {Promise<Object>} Clustering results
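     *
     * A minimal usage sketch; `embeddingHandler` is assumed to be supplied by the
     * caller, the entity object mirrors the shape produced by
     * extractEntitiesFromDataset, and the clusterThreshold value is illustrative.
     * @example
     * const clustering = await algorithms.runEntityClustering(
     *     [{ uri: 'http://example.org/e1', content: 'Example entity', type: 'entity' }],
     *     embeddingHandler,
     *     { clusterThreshold: 0.8 }
     * )
     * console.log(clustering.clusters.length, clustering.topology)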
     */
    async runEntityClustering(entities, embeddingHandler, options = {}) {
        logger.info('Running VSOM entity clustering...')
        
        const opts = { ...this.options, ...options }
        
        // Load entities into VSOM
        const loadResults = await this.vsom.loadFromEntities(entities, embeddingHandler, opts)
        
        // Train the VSOM
        const trainingResults = await this.vsom.train(opts)
        
        // Generate clusters
        const clusters = this.vsom.getClusters(opts.clusterThreshold)
        
        return {
            loadResults,
            trainingResults,
            clusters,
            nodeMappings: this.vsom.getNodeMappings(),
            topology: this.vsom.getTopology()
        }
    }

    /**
     * Run VSOM analysis on dataset
     * @param {Dataset} dataset - RDF dataset containing entities
     * @param {Object} embeddingHandler - Embedding handler for vector generation
     * @param {Object} [options] - Analysis options
     * @returns {Promise<Object>} VSOM analysis results
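     *
     * A minimal usage sketch; assumes `dataset` contains ragno:Entity instances
     * and `embeddingHandler` is configured by the caller.
     * @example
     * const vsomResults = await algorithms.runVSOMAnalysis(dataset, embeddingHandler)
     * console.log(vsomResults.clusters.length, vsomResults.nodeMappings)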
     */
    async runVSOMAnalysis(dataset, embeddingHandler, options = {}) {
        logger.info('Running VSOM analysis on RDF dataset...')
        
        // Extract entities from dataset
        const entities = this.extractEntitiesFromDataset(dataset)
        
        // Run clustering
        return await this.runEntityClustering(entities, embeddingHandler, options)
    }

    /**
     * Get comprehensive statistics from all algorithm modules
     * @returns {Object} Combined statistics
     */
    getAllStatistics() {
        return {
            suite: this.stats,
            graphAnalytics: this.graphAnalytics.getStatistics(),
            communityDetection: this.communityDetection.getStatistics(),
            personalizedPageRank: this.personalizedPageRank.getStatistics(),
            hyde: this.hyde.getStatistics(),
            vsom: this.vsom.getStatistics()
        }
    }
    
    /**
     * Reset all statistics
     */
    resetStatistics() {
        this.stats = {
            lastFullAnalysis: null,
            analysisCount: 0,
            totalProcessingTime: 0
        }
        
        logger.info('Algorithm statistics reset')
    }

    /**
     * Extract entities from RDF dataset (helper method)
     * @param {Dataset} dataset - RDF dataset
     * @returns {Array} Array of entities
     */
    extractEntitiesFromDataset(dataset) {
        const entities = []
        
        // Resolve namespaces, falling back to the default ragno/RDF/RDFS IRIs
        // when no custom namespaces have been injected on the instance
        const ragnoNS = this.namespaces?.ragno || namespace('http://purl.org/stuff/ragno/')
        const rdfNS = this.namespaces?.rdf || namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
        const rdfsNS = this.namespaces?.rdfs || namespace('http://www.w3.org/2000/01/rdf-schema#')
        
        // Find all ragno:Entity instances in the dataset
        const entityTriples = [...dataset.match(null, rdfNS('type'), ragnoNS('Entity'))]
        
        for (const triple of entityTriples) {
            const entityUri = triple.subject
            
            // Get entity properties
            const labelTriples = [...dataset.match(entityUri, rdfsNS('label'), null)]
            const contentTriples = [...dataset.match(entityUri, ragnoNS('content'), null)]
            
            const label = labelTriples[0]?.object.value || ''
            const content = contentTriples[0]?.object.value || label
            
            if (content) {
                entities.push({
                    uri: entityUri.value,
                    content: content,
                    type: 'entity',
                    metadata: {
                        fromDataset: true
                    }
                })
            }
        }
        
        logger.debug(`Extracted ${entities.length} entities from dataset`)
        return entities
    }
}

// Export individual algorithm classes for direct use
export {
    GraphAnalytics,
    CommunityDetection,
    PersonalizedPageRank,
    Hyde,
    VSOM
}
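
// Example (a sketch) of using an individual algorithm class directly; assumes an
// RDF-Ext `dataset` is already loaded by the calling code:
//
//   import { GraphAnalytics } from './algorithms/index.js'
//
//   const analytics = new GraphAnalytics()
//   const graph = analytics.buildGraphFromRDF(dataset, { undirected: true })
//   const stats = analytics.computeGraphStatistics(graph)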