// Source: services/search/SearchService.js

import logger from 'loglevel';
import faiss from 'faiss-node';
const { IndexFlatIP } = faiss;

import EmbeddingService from '../embeddings/EmbeddingService.js';
import SPARQLService from '../embeddings/SPARQLService.js';

/**
 * Service for semantic search using Faiss and embeddings.
 *
 * Resources and their stored embedding vectors are fetched from a SPARQL
 * graph, loaded into an in-memory Faiss inner-product index, and queried with
 * an embedding generated from the search text.
 *
 * Invariant maintained by this class: the vector with Faiss label `i` belongs
 * to `this.resources[i]`, and `this.resourceMap.get(i)` is that resource's URI.
 */
class SearchService {
    /**
     * Creates a new SearchService
     * @param {Object} options - Configuration options
     * @param {EmbeddingService} options.embeddingService - The embedding service to use
     * @param {SPARQLService} options.sparqlService - The SPARQL service to use
     * @param {string} options.graphName - The graph name to search in
     * @param {number} options.dimension - The embedding dimension
     */
    constructor(options = {}) {
        this.embeddingService = options.embeddingService || new EmbeddingService();
        this.sparqlService = options.sparqlService || new SPARQLService();
        this.graphName = options.graphName || 'http://hyperdata.it/content';
        this.dimension = options.dimension || 1536;

        this.initialized = false;
        this.initPromise = null; // In-flight initialization shared by concurrent callers
        this.index = null;
        this.resources = []; // Resource metadata, positioned by Faiss label
        this.resourceMap = new Map(); // Map from index to resource URI

        logger.info(`SearchService initialized with graph: ${this.graphName}`);
    }

    /**
     * Initialize the search service.
     *
     * Safe to call repeatedly and concurrently: once initialized this is a
     * no-op, and overlapping calls await the same in-flight initialization
     * instead of each loading the embeddings again.
     *
     * @returns {Promise<void>}
     * @throws {Error} If the graph does not exist or the embeddings cannot be loaded
     */
    async initialize() {
        if (this.initialized) return;

        if (!this.initPromise) {
            // Clear the shared promise once it settles so that a failed
            // attempt can be retried by a later call.
            this.initPromise = this._runInitialization().finally(() => {
                this.initPromise = null;
            });
        }

        await this.initPromise;
    }

    /**
     * Performs one initialization attempt: creates the index, verifies that
     * the graph exists and loads the embeddings. Internal helper of
     * initialize(); call initialize() instead.
     *
     * @returns {Promise<void>}
     * @throws {Error} If the graph does not exist or the embeddings cannot be loaded
     */
    async _runInitialization() {
        logger.info('Initializing SearchService...');

        try {
            // Create the Faiss index
            this.index = new IndexFlatIP(this.dimension);

            // Check if the graph exists
            const graphExists = await this.sparqlService.graphExists(this.graphName);

            if (!graphExists) {
                throw new Error(`Graph ${this.graphName} does not exist or is empty`);
            }

            // Load embeddings from SPARQL store
            await this.loadEmbeddings();

            this.initialized = true;
            logger.info(`SearchService initialized with ${this.resources.length} resources`);
        } catch (error) {
            logger.error('Failed to initialize SearchService:', error);
            throw error;
        }
    }

    /**
     * Load embeddings from the SPARQL store.
     *
     * A fresh index and fresh lookup tables are built locally and swapped in
     * together at the end, so calling this method again reloads the data
     * instead of appending duplicate vectors to the existing index.
     * Resources whose embedding is not an array of `this.dimension` entries
     * are skipped with a warning; a resource that fails to parse is logged
     * and skipped.
     *
     * @returns {Promise<void>}
     * @throws {Error} If fetching from the SPARQL store fails
     */
    async loadEmbeddings() {
        logger.info('Loading embeddings from SPARQL store...');

        try {
            // Fetch resources with their embeddings
            const rows = await this.sparqlService.fetchResourcesWithEmbeddings(
                null, // No specific resource class
                'http://schema.org/articleBody', // Content predicate
                'http://purl.org/stuff/ragno/hasEmbedding', // Embedding predicate (Ragno namespace)
                this.graphName
            );

            logger.info(`Found ${rows.length} resources with embeddings`);

            const index = new IndexFlatIP(this.dimension);
            const resources = [];
            const resourceMap = new Map();

            for (const row of rows) {
                try {
                    const uri = row.resource.value;
                    const content = row.content.value;

                    // Parse the embedding vector
                    const embedding = JSON.parse(row.embedding.value);

                    if (!Array.isArray(embedding) || embedding.length !== this.dimension) {
                        logger.warn(`Skipping resource with invalid embedding: ${uri}`);
                        continue;
                    }

                    // Build the metadata entry before touching the index: if
                    // this throws, no vector has been added and the labels
                    // stay aligned with the resources array.
                    const entry = {
                        uri,
                        content: this.truncateContent(content),
                        title: this.extractTitle(uri, content)
                    };

                    index.add(embedding);

                    // The vector just added received the next label, which is
                    // the position the entry is about to take.
                    resourceMap.set(resources.length, uri);
                    resources.push(entry);
                } catch (error) {
                    logger.error(`Error processing resource embedding: ${error.message}`);
                }
            }

            this.index = index;
            this.resources = resources;
            this.resourceMap = resourceMap;

            logger.info(`Added ${resources.length} valid embeddings to the index`);
        } catch (error) {
            logger.error('Error loading embeddings:', error);
            throw error;
        }
    }

    /**
     * Search for resources similar to the query text.
     *
     * Initializes the service on first use. Returns an empty array for an
     * empty query, for a limit below 1, or when nothing is indexed.
     *
     * @param {string} queryText - The search query text
     * @param {number} limit - Maximum number of results to return
     * @returns {Promise<Array>} Results as `{ uri, title, content, score }`,
     *     with `score` clamped to the range 0..1
     * @throws {Error} If initialization, embedding generation or the index search fails
     */
    async search(queryText, limit = 5) {
        if (!this.initialized) {
            await this.initialize();
        }

        if (!queryText || queryText.trim().length === 0) {
            return [];
        }

        // faiss-node rejects k greater than the number of indexed vectors, so
        // never ask for more neighbours than there are resources.
        const k = Math.min(Math.trunc(limit), this.resources.length);
        if (!(k > 0)) {
            return [];
        }

        try {
            // Generate embedding for the query
            const queryEmbedding = await this.embeddingService.generateEmbedding(queryText);

            // Faiss-node returns an object with labels and distances
            const { labels, distances } = this.index.search(queryEmbedding, k);

            const results = [];

            for (let i = 0; i < labels.length; i++) {
                const id = labels[i];
                const uri = this.resourceMap.get(id);

                // Labels without a mapped resource are skipped
                if (!uri) {
                    continue;
                }

                // Look the metadata up by label rather than by URI: several
                // indexed entries may share a URI, and each hit must report
                // the entry its own vector was built from.
                const resource = this.resources[id];

                results.push({
                    uri,
                    title: resource?.title || this.getFilenameFromUri(uri),
                    content: resource?.content || '',
                    // Inner product can fall outside 0..1, so clamp it
                    score: Math.min(1, Math.max(0, distances[i]))
                });
            }

            return results;
        } catch (error) {
            logger.error('Error searching resources:', error);
            throw error;
        }
    }

    /**
     * Extract a title from the content or URI.
     *
     * If the first line of the content starts with "# ", the rest of that
     * line is the title; otherwise the filename part of the URI is used.
     *
     * @param {string} uri - The resource URI
     * @param {string} content - The resource content
     * @returns {string} The extracted title
     */
    extractTitle(uri, content) {
        const firstLine = content.split('\n')[0].trim();
        if (firstLine.startsWith('# ')) {
            return firstLine.substring(2).trim();
        }

        // Otherwise get filename from URI
        return this.getFilenameFromUri(uri);
    }

    /**
     * Extract filename from URI.
     *
     * Returns the last non-empty "/"-separated segment, so a trailing slash
     * does not produce an empty name. If there is no non-empty segment the
     * URI itself is returned.
     *
     * @param {string} uri - The resource URI
     * @returns {string} The extracted filename
     */
    getFilenameFromUri(uri) {
        const segments = uri.split('/').filter((segment) => segment.length > 0);
        return segments.length > 0 ? segments[segments.length - 1] : uri;
    }

    /**
     * Truncate content for display.
     *
     * Content longer than `maxLength` is cut to `maxLength` characters and
     * suffixed with "..."; shorter or empty content is returned unchanged.
     *
     * @param {string} content - The content to truncate
     * @param {number} maxLength - Maximum length
     * @returns {string} The truncated content
     */
    truncateContent(content, maxLength = 200) {
        if (!content || content.length <= maxLength) {
            return content;
        }

        return content.substring(0, maxLength) + '...';
    }
}

export default SearchService;