// JSDoc source: ragno/algorithms/Hyde.js

/**
 * Hyde.js - Hypothetical Document Embeddings for Ragno Knowledge Graphs
 * 
 * The HyDE (Hypothetical Document Embeddings) algorithm enhances retrieval for large 
 * language models (LLMs) by first using an LLM to generate a hypothetical answer or 
 * document in response to a user query. This synthetic document is then embedded into 
 * a vector space, and the resulting embedding is used to search for semantically similar 
 * real-world documents in a vector database.
 * 
 * This implementation integrates HyDE with the Ragno knowledge graph system by:
 * - Generating hypothetical answers using existing LLM facilities
 * - Adding these answers to the RDF graph with the ragno:maybe property
 * - Creating entity relationships based on the hypothetical content
 * - Supporting multiple input sources (queries, entities, semantic units)
 * 
 * Key Features:
 * - Query-to-hypothetical-answer generation
 * - RDF integration with ragno:maybe property annotation
 * - Vector embedding of hypothetical content
 * - Entity extraction from generated content
 * - Graph augmentation with uncertainty markers
 */

import rdf from 'rdf-ext'
import { logger } from '../../Utils.js'
import NamespaceManager from '../core/NamespaceManager.js'
import Entity from '../Entity.js'
import SemanticUnit from '../SemanticUnit.js'
import Relationship from '../Relationship.js'

/**
 * HyDE (Hypothetical Document Embeddings) generator for Ragno graphs.
 *
 * For each input query the class asks an LLM handler for several hypothetical
 * answers, wraps each answer in a SemanticUnit, optionally extracts entities
 * from it, links query / hypothesis / entities with Relationship objects and
 * writes everything to an RDF dataset, marking every generated node with
 * `ragno:maybe "true"`.
 */
export default class Hyde {
    /**
     * @param {Object} [options] - Configuration overrides. Keys whose value is
     *   `undefined` are ignored so they cannot clobber a default.
     * @param {number} [options.maxTokens=512] - Token budget passed to the LLM for hypothesis generation
     * @param {number} [options.temperature=0.7] - Base sampling temperature (0 is honoured)
     * @param {string} [options.model='default'] - Model name passed to the LLM handler
     * @param {number} [options.hypothesesPerQuery=3] - Hypotheses generated per input
     * @param {boolean} [options.extractEntities=true] - Whether to run entity extraction
     * @param {number} [options.maxEntitiesPerHypothesis=10] - Cap on extracted entities
     * @param {string} [options.uriBase='http://example.org/ragno/'] - Base URI handed to NamespaceManager
     *
     * NOTE(review): `includeOriginalQuery`, `confidenceThreshold` and
     * `preserveProvenance` are stored in `this.options` but no method of this
     * class reads them.
     */
    constructor(options = {}) {
        const defaults = {
            // LLM options for hypothesis generation
            maxTokens: 512,
            temperature: 0.7,
            model: 'default',

            // HyDE-specific options
            hypothesesPerQuery: 3,
            includeOriginalQuery: true,
            confidenceThreshold: 0.5,

            // Entity extraction options
            extractEntities: true,
            maxEntitiesPerHypothesis: 10,

            // RDF options
            uriBase: 'http://example.org/ragno/',
            preserveProvenance: true
        }

        // Caller-supplied values win, including falsy ones such as 0 or false.
        this.options = { ...defaults, ...Hyde.definedOnly(options) }

        this.namespaces = new NamespaceManager({ uriBase: this.options.uriBase })

        this.stats = Hyde.emptyStats()

        logger.debug('Hyde algorithm initialized')
    }

    /**
     * Copy an options object, dropping keys whose value is `undefined`.
     * @param {Object} [options] - Options object (null/undefined is treated as empty)
     * @returns {Object} Shallow copy without undefined-valued keys
     */
    static definedOnly(options) {
        const entries = Object.entries(options ?? {})
        return Object.fromEntries(entries.filter(([, value]) => value !== undefined))
    }

    /**
     * Build a zeroed statistics record.
     * @returns {Object} Fresh statistics object
     */
    static emptyStats() {
        return {
            totalQueries: 0,
            totalHypotheses: 0,
            totalEntitiesExtracted: 0,
            totalExecutionTime: 0,
            lastRun: null
        }
    }

    /**
     * Build an identifier that is unique within this process.
     *
     * A timestamp alone is not enough: several IDs are routinely created in
     * the same millisecond, so a class-wide sequence number is appended.
     * @param {string} prefix - Identifier prefix
     * @param {number} [index] - Optional positional index included in the ID
     * @returns {string} `prefix-timestamp[-index]-sequence`
     */
    static createUniqueId(prefix, index) {
        Hyde.idSequence = (Hyde.idSequence || 0) + 1
        const parts = [prefix, Date.now()]
        if (index !== undefined) {
            parts.push(index)
        }
        parts.push(Hyde.idSequence)
        return parts.join('-')
    }

    /**
     * Generate hypothetical answers and augment RDF graph
     * @param {Array|string} inputs - Query strings or entity URIs to generate hypotheses for
     * @param {Object} llmHandler - LLM handler instance for generation
     * @param {Dataset} targetDataset - RDF dataset to augment
     * @param {Object} [options] - Per-call overrides merged over the instance options
     * @returns {Object} Results with generated hypotheses and RDF updates
     * @throws {Error} Re-throws any error raised while processing an input (after logging it)
     */
    async generateHypotheses(inputs, llmHandler, targetDataset, options = {}) {
        const startTime = Date.now()
        const inputArray = Array.isArray(inputs) ? inputs : [inputs]
        logger.info(`Starting HyDE generation for ${inputArray.length} input(s)`)

        const opts = { ...this.options, ...Hyde.definedOnly(options) }

        const results = {
            timestamp: new Date(),
            queries: inputArray,
            hypotheses: [],
            entities: [],
            relationships: [],
            rdfTriples: 0,
            processingTime: 0
        }

        try {
            // Inputs are processed one after another, not in parallel.
            for (const input of inputArray) {
                const queryResults = await this.processQuery(input, llmHandler, targetDataset, opts)
                results.hypotheses.push(...queryResults.hypotheses)
                results.entities.push(...queryResults.entities)
                results.relationships.push(...queryResults.relationships)
                results.rdfTriples += queryResults.rdfTriples
            }

            results.processingTime = Date.now() - startTime

            // Update statistics
            this.stats.totalQueries += inputArray.length
            this.stats.totalHypotheses += results.hypotheses.length
            this.stats.totalEntitiesExtracted += results.entities.length
            this.stats.totalExecutionTime += results.processingTime
            this.stats.lastRun = new Date()

            logger.info(`HyDE generation completed in ${results.processingTime}ms`)
            logger.info(`Generated ${results.hypotheses.length} hypotheses, ${results.entities.length} entities`)

            return results
        } catch (error) {
            logger.error('Error during HyDE generation:', error)
            throw error
        }
    }

    /**
     * Process a single query or input to generate hypotheses
     *
     * A failure while producing one hypothesis is logged and skipped; the
     * remaining hypotheses are still attempted.
     * @param {string} input - Query string or entity URI
     * @param {Object} llmHandler - LLM handler instance
     * @param {Dataset} targetDataset - RDF dataset to augment
     * @param {Object} options - Processing options (reads hypothesesPerQuery, extractEntities)
     * @returns {Object} Processing results
     * @throws {TypeError} If `input` is not a string
     */
    async processQuery(input, llmHandler, targetDataset, options) {
        if (typeof input !== 'string') {
            throw new TypeError(`HyDE input must be a string, received ${typeof input}`)
        }

        logger.debug(`Processing input: ${input.substring(0, 100)}...`)

        const results = {
            input,
            hypotheses: [],
            entities: [],
            relationships: [],
            rdfTriples: 0
        }

        // Generate multiple hypotheses for the input
        for (let i = 0; i < options.hypothesesPerQuery; i++) {
            try {
                const hypothesis = await this.generateSingleHypothesis(input, llmHandler, options, i)
                if (!hypothesis || !hypothesis.content) {
                    continue
                }

                // Create hypothesis semantic unit
                const hypothesisUnit = this.createHypothesisUnit(hypothesis, input, i)
                results.hypotheses.push(hypothesisUnit)

                // Nodes that belong to THIS hypothesis only. Only these are
                // written to the dataset below, so nodes from earlier
                // iterations are not re-exported or counted twice.
                let hypothesisEntities = []
                let hypothesisRelationships = []

                // Extract entities from hypothesis if enabled
                if (options.extractEntities) {
                    hypothesisEntities = await this.extractEntitiesFromHypothesis(
                        hypothesis, llmHandler, options
                    )
                    results.entities.push(...hypothesisEntities)

                    // Create relationships between query and hypothesis entities
                    hypothesisRelationships = this.createHypothesisRelationships(
                        input, hypothesisUnit, hypothesisEntities
                    )
                    results.relationships.push(...hypothesisRelationships)
                }

                // Add to RDF dataset
                results.rdfTriples += this.addHypothesisToRDF(
                    hypothesisUnit, hypothesisEntities, hypothesisRelationships, targetDataset
                )
            } catch (error) {
                logger.warn(`Failed to generate hypothesis ${i + 1} for input: ${error.message}`)
            }
        }

        return results
    }

    /**
     * Generate a single hypothesis using the LLM
     * @param {string} input - Input query or entity URI
     * @param {Object} llmHandler - LLM handler instance; `generateResponse(prompt, context, options)` is awaited
     * @param {Object} options - Generation options (reads model, maxTokens, temperature)
     * @param {number} index - Hypothesis index for variation
     * @returns {Object} Generated hypothesis `{ content, prompt, index, confidence, timestamp }`
     * @throws {Error} If the LLM call fails or does not return a non-empty string
     */
    async generateSingleHypothesis(input, llmHandler, options, index) {
        // Create varied prompts for different hypotheses
        const prompt = this.createHypothesisPrompt(input, index, options)

        const llmOptions = {
            model: options.model || 'qwen2:1.5b', // Ensure we have a default model
            maxTokens: options.maxTokens,
            // Each successive hypothesis is sampled 0.1 hotter for diversity.
            // `??` keeps an explicit temperature of 0 instead of replacing it.
            temperature: (options.temperature ?? 0.7) + (index * 0.1)
        }

        logger.debug(`Generating hypothesis ${index + 1} with prompt: ${prompt.substring(0, 100)}...`)
        logger.debug(`Using model: ${llmOptions.model}`)

        let response
        try {
            response = await llmHandler.generateResponse(prompt, '', llmOptions)
            logger.debug(`Raw LLM response type: ${typeof response}, length: ${response?.length || 'N/A'}`)
        } catch (error) {
            logger.error(`LLM generateResponse failed: ${error.message}`)
            throw error
        }

        if (!response || typeof response !== 'string') {
            logger.warn(`Invalid response from LLM: ${typeof response}, response: ${JSON.stringify(response)}`)
            throw new Error(`Invalid response from LLM: ${typeof response}`)
        }

        // Confidence scoring is best-effort: a failure must not discard the hypothesis.
        let confidence
        try {
            logger.debug(`About to calculate confidence for response type: ${typeof response}, input type: ${typeof input}`)
            confidence = this.estimateConfidence(response, input)
            logger.debug(`Confidence calculation successful: ${confidence}`)
        } catch (confError) {
            logger.error(`Confidence estimation failed: ${confError.message}`)
            logger.error(`Response: ${response}, Input: ${input}`)
            confidence = 0.1 // Fallback confidence
        }

        const hypothesis = {
            content: response,
            prompt,
            index,
            confidence,
            timestamp: new Date()
        }

        logger.debug(`Generated hypothesis with confidence: ${hypothesis.confidence}`)
        return hypothesis
    }

    /**
     * Create a prompt for hypothesis generation
     * @param {string} input - Input query or entity URI
     * @param {number} index - Hypothesis index; selects one of five phrasings, cycling
     * @param {Object} options - Options for prompt creation (currently unused)
     * @returns {string} Generated prompt
     */
    createHypothesisPrompt(input, index, options) {
        const variations = [
            `Provide a comprehensive answer to the following question or topic: ${input}`,
            `Generate a detailed explanation or response about: ${input}`,
            `Create an informative document that addresses: ${input}`,
            `Write a knowledgeable response to: ${input}`,
            `Provide insights and information about: ${input}`
        ]

        const basePrompt = variations[index % variations.length]

        return `${basePrompt}

Please provide a well-structured, informative response that could serve as a hypothetical document for information retrieval. Focus on being comprehensive and accurate while maintaining clarity.`
    }

    /**
     * Estimate confidence score for a generated hypothesis
     *
     * Heuristic only: starts at 0.3, adds bonuses for length, sentence count,
     * punctuation variety, word count, keyword overlap with the input (up to
     * 0.25) and discourse connectors, subtracts penalties for very short text,
     * then applies a random jitter of at most ±0.05.
     * @param {string} hypothesis - Generated hypothesis text
     * @param {string} originalInput - Original input query; a non-string contributes no relevance bonus
     * @returns {number} Confidence score clamped to [0.1, 0.95]
     */
    estimateConfidence(hypothesis, originalInput) {
        // Handle undefined or invalid hypothesis
        if (!hypothesis || typeof hypothesis !== 'string') {
            logger.warn(`Invalid hypothesis for confidence estimation: ${typeof hypothesis}`)
            return 0.1 // Very low confidence for invalid content
        }

        // Lower-cased runs of letters/digits, so "France?" and "france." compare equal.
        const tokenize = text => text.toLowerCase().match(/[\p{L}\p{N}]+/gu) || []
        const lowered = hypothesis.toLowerCase()

        let confidence = 0.3 // Base confidence

        // Length-based factors
        if (hypothesis.length > 200) confidence += 0.05
        if (hypothesis.length > 500) confidence += 0.05
        if (hypothesis.length > 1000) confidence += 0.05

        // Structure-based factors
        const sentences = hypothesis.split(/[.!?]+/).filter(s => s.trim().length > 10)
        if (sentences.length >= 3) confidence += 0.1 // Multiple sentences
        if (hypothesis.includes(':') && hypothesis.includes(';')) confidence += 0.05 // Complex punctuation

        // Word count and complexity
        const words = hypothesis.split(/\s+/)
        if (words.length > 100) confidence += 0.1
        if (words.length > 200) confidence += 0.05

        // Content relevance: share of input keywords (longer than 3 characters)
        // that also occur in the hypothesis. A Set keeps each lookup O(1).
        const inputWords = typeof originalInput === 'string'
            ? tokenize(originalInput).filter(w => w.length > 3)
            : []
        const hypothesisWords = new Set(tokenize(hypothesis))
        const overlap = inputWords.filter(word => hypothesisWords.has(word)).length
        const relevanceScore = inputWords.length > 0 ? overlap / inputWords.length : 0
        confidence += relevanceScore * 0.25 // Up to 0.25 for perfect relevance

        // Quality indicators; matched case-insensitively so a sentence-initial
        // "However" counts too.
        const connectors = ['however', 'therefore', 'furthermore']
        if (connectors.some(connector => lowered.includes(connector))) {
            confidence += 0.05 // Sophisticated connectors
        }

        // Penalize very short or generic responses
        if (hypothesis.length < 100) confidence -= 0.2
        if (words.length < 20) confidence -= 0.2

        // Add some randomness to prevent all hypotheses having identical confidence
        confidence += (Math.random() - 0.5) * 0.1 // ±0.05 random variation

        return Math.max(0.1, Math.min(confidence, 0.95)) // Cap between 0.1 and 0.95
    }

    /**
     * Create a SemanticUnit for a hypothesis
     * @param {Object} hypothesis - Generated hypothesis object
     * @param {string} originalInput - Original input query
     * @param {number} index - Hypothesis index
     * @returns {SemanticUnit} Hypothesis semantic unit
     */
    createHypothesisUnit(hypothesis, originalInput, index) {
        const uri = this.namespaces.ex(Hyde.createUniqueId('hypothesis', index))

        return new SemanticUnit({
            uri: uri.value,
            content: hypothesis.content,
            type: 'hypothesis',
            metadata: {
                originalQuery: originalInput,
                confidence: hypothesis.confidence,
                generationIndex: index,
                timestamp: hypothesis.timestamp,
                hypothetical: true
            },
            namespaces: this.namespaces
        })
    }

    /**
     * Extract entities from a generated hypothesis
     *
     * Asks the LLM for a one-per-line list, strips list markers ("-", "*",
     * "•", "1.", "2)"), drops empty and duplicate names (case-insensitive) and
     * keeps at most `options.maxEntitiesPerHypothesis` of them. Any failure is
     * logged and yields an empty array.
     * @param {Object} hypothesis - Generated hypothesis object
     * @param {Object} llmHandler - LLM handler instance
     * @param {Object} options - Extraction options
     * @returns {Array} Extracted entities
     */
    async extractEntitiesFromHypothesis(hypothesis, llmHandler, options) {
        if (!options.extractEntities) return []

        try {
            const extractionPrompt = `Extract the main entities, concepts, and topics from the following text. Return them as a simple list, one per line:

${hypothesis.content}

Entities:`

            const response = await llmHandler.generateResponse(extractionPrompt, '', {
                model: options.model || 'qwen2:1.5b',
                maxTokens: 200,
                temperature: 0.3
            })

            if (typeof response !== 'string') {
                throw new Error(`Invalid response from LLM: ${typeof response}`)
            }

            const seen = new Set()
            const entityNames = response
                .split('\n')
                .map(line => line.trim().replace(/^(?:-\s*|[*•–]\s+|\d+[.)]\s+)/, '').trim())
                .filter(name => {
                    const key = name.toLowerCase()
                    if (!name || seen.has(key)) return false
                    seen.add(key)
                    return true
                })
                .slice(0, options.maxEntitiesPerHypothesis)

            return entityNames.map((name, index) => {
                const uri = this.namespaces.ex(Hyde.createUniqueId('hypothesis-entity', index))

                return new Entity({
                    uri: uri.value,
                    name,
                    isEntryPoint: false,
                    subType: 'hypothetical-entity',
                    metadata: {
                        extractedFrom: 'hypothesis',
                        confidence: (hypothesis.confidence || 0.5) * 0.8, // Reduce confidence for extracted entities
                        hypothetical: true
                    },
                    namespaces: this.namespaces
                })
            })
        } catch (error) {
            logger.warn(`Failed to extract entities from hypothesis: ${error.message}`)
            return []
        }
    }

    /**
     * Create relationships between query, hypothesis, and extracted entities
     *
     * Produces one `hypothetical-answer` relationship (query -> hypothesis)
     * followed by one `mentions` relationship per entity (hypothesis -> entity).
     * @param {string} originalInput - Original input query
     * @param {SemanticUnit} hypothesisUnit - Hypothesis semantic unit
     * @param {Array} entities - Extracted entities
     * @returns {Array} Created relationships
     */
    createHypothesisRelationships(originalInput, hypothesisUnit, entities) {
        const queryHypothesisUri = this.namespaces.ex(Hyde.createUniqueId('query-hypothesis'))

        const answerRelationship = new Relationship({
            uri: queryHypothesisUri.value,
            subject: originalInput,
            predicate: 'hypothetical-answer',
            object: hypothesisUnit.uri,
            metadata: {
                type: 'hypothesis-generation',
                confidence: hypothesisUnit.metadata?.confidence || 0.5,
                hypothetical: true
            },
            namespaces: this.namespaces
        })

        const mentionRelationships = entities.map((entity, index) => {
            // Distinct prefix: these URIs must never coincide with the
            // `hypothesis-entity-…` URIs of the entities they point at.
            const relationshipUri = this.namespaces.ex(Hyde.createUniqueId('hypothesis-mention', index))

            return new Relationship({
                uri: relationshipUri.value,
                subject: hypothesisUnit.uri,
                predicate: 'mentions',
                object: entity.uri,
                metadata: {
                    type: 'entity-mention',
                    confidence: entity.metadata?.confidence || 0.5,
                    hypothetical: true
                },
                namespaces: this.namespaces
            })
        })

        return [answerRelationship, ...mentionRelationships]
    }

    /**
     * Add hypothesis and related data to RDF dataset with ragno:maybe property
     *
     * The returned count is the sum of each node's `getTriples().length` plus
     * one per marker quad added here; it is not read back from the dataset.
     * @param {SemanticUnit} hypothesisUnit - Hypothesis semantic unit
     * @param {Array} entities - Entities to export alongside the hypothesis
     * @param {Array} relationships - Relationships to export alongside the hypothesis
     * @param {Dataset} targetDataset - Target RDF dataset
     * @returns {number} Number of triples added
     */
    addHypothesisToRDF(hypothesisUnit, entities, relationships, targetDataset) {
        logger.debug('hypothesis Unit added to RDF')

        const maybePredicate = this.namespaces.ragno('maybe')
        const trueLiteral = rdf.literal('true', this.namespaces.xsd('boolean'))

        // Export a node, then flag it as hypothetical; returns the triples counted.
        const exportAsHypothetical = node => {
            node.exportToDataset(targetDataset)
            targetDataset.add(rdf.quad(rdf.namedNode(node.uri), maybePredicate, trueLiteral))
            return (node.getTriples().length || 0) + 1
        }

        let triplesAdded = exportAsHypothetical(hypothesisUnit)

        // Add confidence score
        const confidence = hypothesisUnit.metadata?.confidence || 0.5
        targetDataset.add(rdf.quad(
            rdf.namedNode(hypothesisUnit.uri),
            this.namespaces.ragno('confidence'),
            rdf.literal(confidence.toString(), this.namespaces.xsd('decimal'))
        ))
        triplesAdded++

        for (const entity of entities) {
            triplesAdded += exportAsHypothetical(entity)
        }

        for (const relationship of relationships) {
            triplesAdded += exportAsHypothetical(relationship)
        }

        return triplesAdded
    }

    /**
     * Query hypothetical content from RDF dataset
     *
     * Finds every subject carrying `ragno:maybe "true"` and collects all of
     * its properties as `{ predicateUri: [objectValue, ...] }`.
     * @param {Dataset} dataset - RDF dataset to query
     * @param {Object} [filters] - Map of predicate URI to a value that must be present
     * @returns {Array} Hypothetical content matching filters
     */
    queryHypotheticalContent(dataset, filters = {}) {
        const results = []

        // Find all triples with ragno:maybe = true
        const maybeProperty = this.namespaces.ragno('maybe')
        const trueValue = rdf.literal('true', this.namespaces.xsd('boolean'))

        for (const quad of dataset.match(null, maybeProperty, trueValue)) {
            const subject = quad.subject

            // Get all properties of this hypothetical entity/unit
            const properties = {}
            for (const propQuad of dataset.match(subject, null, null)) {
                const predicate = propQuad.predicate.value

                if (!properties[predicate]) {
                    properties[predicate] = []
                }
                properties[predicate].push(propQuad.object.value)
            }

            // Apply filters if specified
            if (this.matchesFilters(properties, filters)) {
                results.push({
                    uri: subject.value,
                    properties,
                    hypothetical: true
                })
            }
        }

        return results
    }

    /**
     * Check if properties match specified filters
     * @param {Object} properties - Map of predicate URI to array of values
     * @param {Object} filters - Filter criteria; every key must be present with the given value
     * @returns {boolean} Whether properties match filters (true when there are no filters)
     */
    matchesFilters(properties, filters) {
        return Object.entries(filters ?? {}).every(([filterKey, filterValue]) => {
            const values = properties[filterKey]
            return Boolean(values) && values.includes(filterValue)
        })
    }

    /**
     * Get algorithm statistics
     * @returns {Object} Running totals plus per-query averages (0 when no query has run)
     */
    getStatistics() {
        const { totalQueries, totalExecutionTime, totalHypotheses, totalEntitiesExtracted } = this.stats
        const perQuery = total => (totalQueries > 0 ? total / totalQueries : 0)

        return {
            ...this.stats,
            averageExecutionTime: perQuery(totalExecutionTime),
            averageHypothesesPerQuery: perQuery(totalHypotheses),
            averageEntitiesPerQuery: perQuery(totalEntitiesExtracted)
        }
    }

    /**
     * Reset algorithm statistics
     */
    resetStatistics() {
        this.stats = Hyde.emptyStats()

        logger.info('Hyde algorithm statistics reset')
    }
}