Source: ragno/search/VectorIndex.js

/**
 * VectorIndex.js - HNSW Vector Index for Ragno Knowledge Graphs
 * 
 * This module implements Hierarchical Navigable Small World (HNSW) indexing
 * for semantic similarity search in ragno knowledge graphs. It provides
 * efficient approximate nearest neighbor search for vector embeddings.
 * 
 * Key Features:
 * - HNSW index for fast similarity search
 * - Support for multiple embedding dimensions
 * - Type-aware indexing (by ragno ontology types)
 * - Batch insertion and search operations
 * - Integration with ragno RDF elements
 * - Persistence and loading capabilities
 */

import pkg from 'hnswlib-node'
const { HierarchicalNSW } = pkg
import rdf from 'rdf-ext'
import { logger } from '../../Utils.js'

export default class VectorIndex {
    constructor(options = {}) {
        this.options = {
            dimension: options.dimension || 1536, // Default OpenAI embedding size
            maxElements: options.maxElements || 100000,
            efConstruction: options.efConstruction || 200,
            mMax: options.mMax || 16,
            efSearch: options.efSearch || 100,
            seed: options.seed || 100,
            ...options
        }

        // Initialize HNSW index
        this.index = new HierarchicalNSW('cosine', this.options.dimension)
        this.index.initIndex(this.options.maxElements, this.options.mMax, this.options.efConstruction, this.options.seed)
        this.index.setEf(this.options.efSearch)

        // Metadata storage
        this.nodeMetadata = new Map() // nodeId -> { uri, type, content, embedding }
        this.uriToNodeId = new Map() // uri -> nodeId
        this.typeToNodes = new Map() // type -> Set of nodeIds
        this.nextNodeId = 0

        // Index statistics
        this.stats = {
            totalNodes: 0,
            nodesByType: new Map(),
            lastIndexTime: null,
            searchCount: 0,
            averageSearchTime: 0
        }

        logger.debug(`VectorIndex initialized: ${this.options.dimension}D, max ${this.options.maxElements} elements`)
    }

    /**
     * Add a node with its embedding to the index
     * @param {string} uri - Node URI
     * @param {Array<number>} embedding - Vector embedding
     * @param {Object} metadata - Node metadata
     * @returns {number} Node ID in index
     */
    addNode(uri, embedding, metadata = {}) {
        if (this.uriToNodeId.has(uri)) {
            logger.warn(`Node ${uri} already exists in index`)
            return this.uriToNodeId.get(uri)
        }

        if (embedding.length !== this.options.dimension) {
            throw new Error(`Embedding dimension ${embedding.length} does not match index dimension ${this.options.dimension}`)
        }

        const nodeId = this.nextNodeId++

        // Add to HNSW index
        this.index.addPoint(embedding, nodeId)

        // Store metadata
        const nodeMetadata = {
            uri,
            type: metadata.type || 'unknown',
            content: metadata.content || '',
            embedding,
            timestamp: new Date(),
            ...metadata
        }

        this.nodeMetadata.set(nodeId, nodeMetadata)
        this.uriToNodeId.set(uri, nodeId)

        // Update type-based grouping
        const nodeType = nodeMetadata.type
        if (!this.typeToNodes.has(nodeType)) {
            this.typeToNodes.set(nodeType, new Set())
        }
        this.typeToNodes.get(nodeType).add(nodeId)

        // Update statistics
        this.stats.totalNodes++
        const typeCount = this.stats.nodesByType.get(nodeType) || 0
        this.stats.nodesByType.set(nodeType, typeCount + 1)
        this.stats.lastIndexTime = new Date()

        logger.debug(`Added node ${uri} as ID ${nodeId}, type: ${nodeType}`)
        return nodeId
    }

    /**
     * Add multiple nodes in batch
     * @param {Array} nodes - Array of {uri, embedding, metadata} objects
     * @returns {Array<number>} Array of node IDs
     */
    addNodesBatch(nodes) {
        logger.info(`Adding ${nodes.length} nodes to vector index...`)

        const nodeIds = []
        for (const node of nodes) {
            try {
                const nodeId = this.addNode(node.uri, node.embedding, node.metadata)
                nodeIds.push(nodeId)
            } catch (error) {
                logger.error(`Failed to add node ${node.uri}:`, error.message)
            }
        }

        logger.info(`Successfully added ${nodeIds.length}/${nodes.length} nodes to index`)
        return nodeIds
    }

    /**
     * Search for similar nodes
     * @param {Array<number>} queryEmbedding - Query vector
     * @param {number} [k=10] - Number of results to return
     * @param {Object} [options] - Search options
     * @returns {Array} Search results with scores and metadata
     */
    search(queryEmbedding, k = 10, options = {}) {
        const startTime = Date.now()

        if (queryEmbedding.length !== this.options.dimension) {
            throw new Error(`Query embedding dimension ${queryEmbedding.length} does not match index dimension ${this.options.dimension}`)
        }

        if (this.stats.totalNodes === 0) {
            logger.warn('Vector index is empty')
            return []
        }

        // Set search parameters
        const originalEf = this.index.getEf()
        if (options.efSearch) {
            this.index.setEf(options.efSearch)
        }

        try {
            // Perform HNSW search
            const results = this.index.searchKnn(queryEmbedding, k)

            // Enhance results with metadata
            const enhancedResults = []
            for (let i = 0; i < results.neighbors.length; i++) {
                const nodeId = results.neighbors[i]
                const distance = results.distances[i]
                const similarity = 1 - distance // Convert distance to similarity

                const metadata = this.nodeMetadata.get(nodeId)
                if (metadata) {
                    enhancedResults.push({
                        nodeId,
                        uri: metadata.uri,
                        type: metadata.type,
                        content: metadata.content,
                        similarity,
                        distance,
                        metadata: {
                            ...metadata,
                            embedding: undefined // Don't include full embedding in results
                        }
                    })
                }
            }

            // Filter by type if specified
            let filteredResults = enhancedResults
            if (options.filterByTypes && options.filterByTypes.length > 0) {
                filteredResults = enhancedResults.filter(result =>
                    options.filterByTypes.includes(result.type)
                )
            }

            // Update statistics
            const searchTime = Date.now() - startTime
            this.stats.searchCount++
            this.stats.averageSearchTime = (this.stats.averageSearchTime * (this.stats.searchCount - 1) + searchTime) / this.stats.searchCount

            logger.debug(`Vector search completed: ${filteredResults.length} results in ${searchTime}ms`)
            return filteredResults

        } finally {
            // Restore original ef parameter
            this.index.setEf(originalEf)
        }
    }

    /**
     * Search within specific ragno types
     * @param {Array<number>} queryEmbedding - Query vector
     * @param {Array<string>} types - Ragno types to search within
     * @param {number} [k=10] - Number of results per type
     * @returns {Object} Results grouped by type
     */
    searchByTypes(queryEmbedding, types, k = 10) {
        const resultsByType = new Map()

        for (const type of types) {
            const typeResults = this.search(queryEmbedding, k * 2, { // Get more to account for filtering
                filterByTypes: [type]
            })

            resultsByType.set(type, typeResults.slice(0, k))
        }

        return Object.fromEntries(resultsByType)
    }

    /**
     * Find nodes similar to a given node in the index
     * @param {string} uri - URI of the reference node
     * @param {number} [k=10] - Number of similar nodes to return
     * @param {Object} [options] - Search options
     * @returns {Array} Similar nodes (excluding the reference node)
     */
    findSimilarNodes(uri, k = 10, options = {}) {
        const nodeId = this.uriToNodeId.get(uri)
        if (!nodeId) {
            throw new Error(`Node ${uri} not found in vector index`)
        }

        const metadata = this.nodeMetadata.get(nodeId)
        if (!metadata || !metadata.embedding) {
            throw new Error(`No embedding found for node ${uri}`)
        }

        // Search for similar nodes
        const results = this.search(metadata.embedding, k + 1, options) // +1 to account for self

        // Filter out the reference node itself
        return results.filter(result => result.uri !== uri)
    }

    /**
     * Get nodes by type
     * @param {string} type - Ragno type
     * @param {number} [limit] - Maximum number of nodes to return
     * @returns {Array} Nodes of the specified type
     */
    getNodesByType(type, limit) {
        const nodeIds = this.typeToNodes.get(type)
        if (!nodeIds) {
            return []
        }

        const nodes = []
        let count = 0

        for (const nodeId of nodeIds) {
            if (limit && count >= limit) break

            const metadata = this.nodeMetadata.get(nodeId)
            if (metadata) {
                nodes.push({
                    nodeId,
                    uri: metadata.uri,
                    type: metadata.type,
                    content: metadata.content,
                    metadata: {
                        ...metadata,
                        embedding: undefined
                    }
                })
                count++
            }
        }

        return nodes
    }

    /**
     * Remove a node from the index
     * @param {string} uri - URI of the node to remove
     * @returns {boolean} True if node was removed
     */
    removeNode(uri) {
        const nodeId = this.uriToNodeId.get(uri)
        if (!nodeId) {
            logger.warn(`Node ${uri} not found in index`)
            return false
        }

        const metadata = this.nodeMetadata.get(nodeId)
        if (metadata) {
            // Remove from type grouping
            const nodeType = metadata.type
            if (this.typeToNodes.has(nodeType)) {
                this.typeToNodes.get(nodeType).delete(nodeId)
                if (this.typeToNodes.get(nodeType).size === 0) {
                    this.typeToNodes.delete(nodeType)
                }
            }

            // Update statistics
            this.stats.totalNodes--
            const typeCount = this.stats.nodesByType.get(nodeType) || 0
            if (typeCount > 1) {
                this.stats.nodesByType.set(nodeType, typeCount - 1)
            } else {
                this.stats.nodesByType.delete(nodeType)
            }
        }

        // Remove from metadata storage
        this.nodeMetadata.delete(nodeId)
        this.uriToNodeId.delete(uri)

        // Note: HNSW doesn't support deletion, so the point remains in the index
        // This is a limitation we need to document

        logger.debug(`Removed node ${uri} from metadata (HNSW point remains)`)
        return true
    }

    /**
     * Check if a node exists in the index
     * @param {string} uri - Node URI
     * @returns {boolean} True if node exists
     */
    hasNode(uri) {
        return this.uriToNodeId.has(uri)
    }

    /**
     * Get metadata for a node
     * @param {string} uri - Node URI
     * @returns {Object|null} Node metadata
     */
    getNodeMetadata(uri) {
        const nodeId = this.uriToNodeId.get(uri)
        if (!nodeId) {
            return null
        }

        const metadata = this.nodeMetadata.get(nodeId)
        return metadata ? { ...metadata, embedding: undefined } : null
    }

    /**
     * Save index to file
     * @param {string} indexPath - Path to save HNSW index
     * @param {string} metadataPath - Path to save metadata
     */
    async saveIndex(indexPath, metadataPath) {
        logger.info(`Saving vector index to ${indexPath}...`)

        try {
            // Save HNSW index
            this.index.writeIndex(indexPath)

            // Save metadata
            const metadataObj = {
                options: this.options,
                nodeMetadata: Object.fromEntries(this.nodeMetadata),
                uriToNodeId: Object.fromEntries(this.uriToNodeId),
                typeToNodes: Object.fromEntries(
                    Array.from(this.typeToNodes.entries()).map(([type, nodeSet]) => [type, Array.from(nodeSet)])
                ),
                nextNodeId: this.nextNodeId,
                stats: this.stats
            }

            await require('fs/promises').writeFile(metadataPath, JSON.stringify(metadataObj, null, 2))

            logger.info('Vector index saved successfully')
        } catch (error) {
            logger.error('Failed to save vector index:', error)
            throw error
        }
    }

    /**
     * Load index from file
     * @param {string} indexPath - Path to HNSW index file
     * @param {string} metadataPath - Path to metadata file
     */
    async loadIndex(indexPath, metadataPath) {
        logger.info(`Loading vector index from ${indexPath}...`)

        try {
            // Load HNSW index
            this.index.readIndex(indexPath)

            // Load metadata
            const metadataContent = await require('fs/promises').readFile(metadataPath, 'utf-8')
            const metadataObj = JSON.parse(metadataContent)

            // Restore state
            this.options = { ...this.options, ...metadataObj.options }
            this.nodeMetadata = new Map(Object.entries(metadataObj.nodeMetadata).map(([k, v]) => [parseInt(k), v]))
            this.uriToNodeId = new Map(Object.entries(metadataObj.uriToNodeId))
            this.typeToNodes = new Map(
                Object.entries(metadataObj.typeToNodes).map(([type, nodeArray]) => [type, new Set(nodeArray)])
            )
            this.nextNodeId = metadataObj.nextNodeId
            this.stats = metadataObj.stats

            logger.info(`Vector index loaded: ${this.stats.totalNodes} nodes`)
        } catch (error) {
            logger.error('Failed to load vector index:', error)
            throw error
        }
    }

    /**
     * Clear the entire index
     */
    clear() {
        this.index = new HierarchicalNSW('cosine', this.options.dimension)
        this.index.initIndex(this.options.maxElements, this.options.mMax, this.options.efConstruction, this.options.seed)
        this.index.setEf(this.options.efSearch)

        this.nodeMetadata.clear()
        this.uriToNodeId.clear()
        this.typeToNodes.clear()
        this.nextNodeId = 0

        this.stats = {
            totalNodes: 0,
            nodesByType: new Map(),
            lastIndexTime: null,
            searchCount: 0,
            averageSearchTime: 0
        }

        logger.info('Vector index cleared')
    }

    /**
     * Get index statistics
     * @returns {Object} Index statistics
     */
    getStatistics() {
        return {
            ...this.stats,
            nodesByType: Object.fromEntries(this.stats.nodesByType),
            availableTypes: Array.from(this.typeToNodes.keys()),
            indexSize: this.stats.totalNodes,
            dimension: this.options.dimension,
            maxElements: this.options.maxElements
        }
    }

    /**
     * Optimize index performance
     * @param {Object} [options] - Optimization options
     */
    optimizeIndex(options = {}) {
        logger.info('Optimizing vector index...')

        // Adjust ef parameter based on index size
        const optimalEf = Math.max(this.options.efSearch, Math.min(200, this.stats.totalNodes / 10))
        this.index.setEf(optimalEf)

        logger.info(`Index optimized: ef set to ${optimalEf}`)
    }
}