Source: aux/wikidata/WikidataResearcher.js

/**
 * WikidataResearcher - Orchestrate Wikidata research workflows
 * 
 * This component provides a simplified, standardized API for conducting
 * Wikidata research. It encapsulates the complex workflow of concept extraction,
 * entity searching, RDF conversion, and knowledge graph storage.
 * 
 * API: executeResearch(input, resources, options)
 */

import logger from 'loglevel';
import WikidataSearch from './WikidataSearch.js';
import WikidataToRagno from './WikidataToRagno.js';
import QueryTemplateManager from './QueryTemplateManager.js';

/**
 * Runs a Wikidata research workflow end to end: obtain concepts (supplied by
 * the caller or extracted with an LLM), search Wikidata for matching entities,
 * convert them with WikidataToRagno, optionally store the resulting triples
 * through a SPARQL helper, and keep running statistics in `this.stats`.
 */
export default class WikidataResearcher {
    constructor() {
        // Single source of truth for the initial statistics shape.
        this.clearStatistics();
    }

    /**
     * Execute complete Wikidata research workflow.
     *
     * Never rejects: any failure is reported as `{ success: false, error, ... }`.
     *
     * @param {Object} input - Research input data
     * @param {string} input.question - Research question or text to analyze
     * @param {Array<string>} [input.concepts] - Pre-extracted concepts (optional)
     * @param {Object} resources - External dependencies
     * @param {Object} [resources.llmHandler] - LLM handler for concept extraction
     * @param {Object} [resources.sparqlHelper] - SPARQL helper for storage operations
     * @param {Object} [resources.config] - Configuration object with graph URIs
     * @param {Object} [options] - Configuration options
     * @param {number} [options.maxEntitiesPerConcept] - Max entities per concept (default: 3).
     *   NOTE(review): recorded in the effective config but not read by any step
     *   in this class - confirm whether the search step should enforce it.
     * @param {number} [options.maxWikidataSearchResults] - Max search results (default: 15)
     * @param {number} [options.minEntityConfidence] - Min confidence threshold (default: 0.4)
     * @param {boolean} [options.enableHierarchySearch] - Enable hierarchy exploration (default: true)
     * @param {boolean} [options.storeResults] - Store results in knowledge graph (default: true)
     * @param {string} [options.storageGraph] - Target graph URI; falls back to
     *   `resources.config.wikidataGraphURI`, then to a built-in default
     * @returns {Promise<Object>} Research results with entities and metadata
     */
    async executeResearch(input, resources, options = {}) {
        const startTime = Date.now();
        // Read defensively up front so the catch block can report it even when
        // `input` itself is null/undefined.
        const question = input?.question;

        try {
            const { concepts: preExtractedConcepts } = input ?? {};
            const { llmHandler, sparqlHelper, config } = resources ?? {};
            const researchConfig = this._buildResearchConfig(options, config);
            const hasQuestion = typeof question === 'string' && question.trim() !== '';

            this.stats.totalResearches++;

            // Step 1: Extract concepts from the question (if not provided)
            let concepts = preExtractedConcepts;
            if (!concepts && llmHandler && hasQuestion) {
                const conceptResult = await this._extractConcepts(question, llmHandler, researchConfig);
                if (conceptResult.success) {
                    concepts = conceptResult.concepts;
                    this.stats.conceptsExtracted += concepts.length;
                }
            }

            if (!concepts || concepts.length === 0) {
                if (!hasQuestion) {
                    // Nothing to search for; do not query Wikidata for `undefined`.
                    return this._buildFailureResult(
                        'No question or concepts provided',
                        question,
                        [],
                        startTime
                    );
                }
                // Fallback: use question text directly
                concepts = [question];
            }

            // Step 2: Search Wikidata for entities
            const searchResult = await this._searchWikidataEntities(concepts, researchConfig);

            if (!searchResult.success) {
                return this._buildFailureResult(searchResult.error, question, concepts, startTime);
            }

            // Step 3: Convert to Ragno format
            const conversionResult = await this._convertToRagno(
                searchResult.entities,
                question,
                researchConfig
            );

            // Step 4: Store results (if enabled)
            let storageResult = null;
            if (researchConfig.storeResults && sparqlHelper) {
                storageResult = await this._storeResults(
                    conversionResult.ragnoEntities,
                    question,
                    concepts,
                    sparqlHelper,
                    researchConfig
                );
            }

            const entitiesFound = searchResult.entities.length;
            const entitiesConverted = conversionResult.ragnoEntities.length;
            const stored = storageResult?.success || false;

            // Update statistics
            this.stats.entitiesFound += entitiesFound;
            this.stats.entitiesConverted += entitiesConverted;

            // Record session
            this.stats.researchSessions.push({
                question,
                concepts,
                entitiesFound,
                entitiesConverted,
                duration: Date.now() - startTime,
                timestamp: new Date().toISOString(),
                stored
            });

            return {
                success: true,
                question,
                concepts,
                wikidataEntities: searchResult.entities,
                ragnoEntities: conversionResult.ragnoEntities,
                storageResult,
                metadata: {
                    researchDuration: Date.now() - startTime,
                    conceptsUsed: concepts.length,
                    entitiesFound,
                    entitiesConverted,
                    stored,
                    timestamp: new Date().toISOString(),
                    config: researchConfig
                }
            };

        } catch (error) {
            const message = error?.message ?? String(error);
            logger.error('Wikidata research failed:', message);
            return this._buildFailureResult(message, question, [], startTime, { errorOccurred: true });
        }
    }

    /**
     * Get research statistics
     *
     * @param {Object} input - Statistics input (can be empty, unused)
     * @param {Object} resources - External dependencies (unused)
     * @param {Object} options - Configuration options
     * @param {boolean} options.includeSessionDetails - Include the last 5 sessions (default: false)
     * @returns {Object} `{ success, statistics, metadata }`
     */
    getStatistics(input = {}, resources = {}, options = {}) {
        const { totalResearches, conceptsExtracted, entitiesFound, entitiesConverted } = this.stats;

        const stats = {
            totalResearches,
            conceptsExtracted,
            entitiesFound,
            entitiesConverted,
            averageEntitiesPerResearch: totalResearches > 0 ?
                Math.round(entitiesFound / totalResearches) : 0,
            conversionRate: entitiesFound > 0 ?
                (entitiesConverted / entitiesFound) : 0
        };

        if (options?.includeSessionDetails) {
            stats.recentSessions = this.stats.researchSessions.slice(-5); // Last 5 sessions
        }

        return {
            success: true,
            statistics: stats,
            metadata: {
                timestamp: new Date().toISOString()
            }
        };
    }

    /**
     * Merge caller options with defaults into the effective research config.
     *
     * Caller options are spread first and the resolved values are written on
     * top, so an option explicitly set to `undefined`/`null` falls back to its
     * default instead of overriding it. Unknown extra options are passed through.
     * @private
     */
    _buildResearchConfig(options, config) {
        const opts = options ?? {};
        return {
            ...opts,
            maxEntitiesPerConcept: opts.maxEntitiesPerConcept ?? 3,
            maxWikidataSearchResults: opts.maxWikidataSearchResults ?? 15,
            minEntityConfidence: opts.minEntityConfidence ?? 0.4,
            enableHierarchySearch: opts.enableHierarchySearch ?? true,
            storeResults: opts.storeResults ?? true,
            storageGraph: opts.storageGraph || config?.wikidataGraphURI || 'http://purl.org/stuff/wikidata/research'
        };
    }

    /**
     * Build the failure object returned by executeResearch.
     * @private
     */
    _buildFailureResult(error, question, concepts, startTime, extraMetadata = {}) {
        return {
            success: false,
            error,
            question,
            concepts,
            ragnoEntities: [],
            metadata: {
                researchDuration: Date.now() - startTime,
                ...extraMetadata,
                timestamp: new Date().toISOString()
            }
        };
    }

    /**
     * Extract concepts from text using LLM.
     *
     * Sends a fixed prompt through `llmHandler.generateResponse`, then keeps up
     * to 5 response lines whose length is between 3 and 49 characters after
     * trimming and after removing a leading bullet or list number.
     * Returns `{ success: false, error, concepts: [] }` on any failure.
     * @private
     */
    async _extractConcepts(text, llmHandler, config) {
        try {
            const prompt = `Extract 3-5 key concepts from this text that could be searched in Wikidata:

"${text}"

Return only the concepts, one per line, without explanations or numbers.`;

            const response = await llmHandler.generateResponse(prompt);
            if (typeof response !== 'string') {
                throw new TypeError('LLM response was not a string');
            }

            // Parse concepts from response. Models often add "1." / "-" / "*"
            // list markers despite the prompt, so strip them before filtering.
            const concepts = response
                .split('\n')
                .map(line => line.replace(/^\s*(?:[-*•]|\d+[.)])\s+/, '').trim())
                .filter(line => line.length > 2 && line.length < 50)
                .slice(0, 5); // Limit to 5 concepts

            return {
                success: true,
                concepts,
                extractionMethod: 'llm'
            };

        } catch (error) {
            logger.debug('Concept extraction failed:', error.message);
            return {
                success: false,
                error: error.message,
                concepts: []
            };
        }
    }

    /**
     * Search Wikidata for entities based on concepts.
     *
     * Runs one `searchByText` call per concept in parallel. A concept whose
     * search throws, reports failure, or returns no entity array contributes
     * nothing rather than failing the whole search. Results are deduplicated
     * by `entity.id` (first occurrence wins) and truncated to
     * `config.maxWikidataSearchResults`.
     * @private
     */
    async _searchWikidataEntities(concepts, config) {
        try {
            const wikidataSearch = new WikidataSearch({
                maxResults: config.maxWikidataSearchResults,
                minConfidence: config.minEntityConfidence,
                enableHierarchy: config.enableHierarchySearch
            });

            const searchResults = await Promise.all(concepts.map(async (concept) => {
                try {
                    const result = await wikidataSearch.searchByText(concept);
                    return result?.success && Array.isArray(result.entities) ? result.entities : [];
                } catch (searchError) {
                    // Best-effort per concept, but leave a trace for debugging.
                    logger.debug(`Wikidata search failed for concept "${concept}":`, searchError?.message);
                    return [];
                }
            }));

            // Flatten and deduplicate entities
            const entityMap = new Map();
            for (const found of searchResults) {
                for (const entity of found) {
                    if (!entityMap.has(entity.id)) {
                        entityMap.set(entity.id, entity);
                    }
                }
            }

            const entities = Array.from(entityMap.values())
                .slice(0, config.maxWikidataSearchResults);

            return {
                success: true,
                entities,
                conceptsSearched: concepts.length,
                totalFound: entities.length
            };

        } catch (error) {
            logger.error('Wikidata entity search failed:', error.message);
            return {
                success: false,
                error: error.message,
                entities: []
            };
        }
    }

    /**
     * Convert Wikidata entities to Ragno format.
     *
     * Entities are converted one at a time; an entity whose conversion throws
     * or does not report success is skipped. Each converted entity is wrapped
     * in a display object carrying its URI, label, triples and metadata.
     * @private
     */
    async _convertToRagno(entities, originalQuestion, config) {
        try {
            const ragnoConverter = new WikidataToRagno({
                enableDescriptions: true,
                enableProperties: true,
                maxPropertiesPerEntity: 5
            });

            const ragnoEntities = [];

            for (const entity of entities) {
                try {
                    const ragnoEntity = await ragnoConverter.convertEntity(entity, {
                        sourceQuestion: originalQuestion,
                        timestamp: new Date().toISOString()
                    });

                    if (!ragnoEntity?.success) {
                        continue;
                    }

                    // Create a proper entity object with label for display
                    ragnoEntities.push({
                        uri: ragnoEntity.entityURI,
                        label: entity.label || entity.id,
                        description: entity.description || '',
                        type: 'wikidata-entity',
                        wikidataId: entity.id,
                        triples: ragnoEntity.ragnoTriples,
                        originalEntity: entity,
                        conversionMetadata: {
                            convertedAt: new Date().toISOString(),
                            sourceQuestion: originalQuestion,
                            conversionMethod: 'wikidata-to-ragno',
                            propertyCount: ragnoEntity.propertyCount || 0
                        }
                    });
                } catch (conversionError) {
                    logger.debug(`Failed to convert entity ${entity?.id}:`, conversionError.message);
                }
            }

            return {
                success: true,
                ragnoEntities,
                originalCount: entities.length,
                convertedCount: ragnoEntities.length
            };

        } catch (error) {
            logger.error('Ragno conversion failed:', error.message);
            return {
                success: false,
                error: error.message,
                ragnoEntities: []
            };
        }
    }

    /**
     * Store research results in knowledge graph.
     *
     * Builds a session description (question, concepts, entity count, creation
     * time) plus every entity's triples and sends them as a single
     * `INSERT DATA` update via `sparqlHelper.executeUpdate`. All free text is
     * escaped before being placed inside a string literal.
     * @private
     */
    async _storeResults(ragnoEntities, originalQuestion, concepts, sparqlHelper, config) {
        try {
            // Escape each concept: they come from user input or an LLM and may
            // contain quotes or backslashes that would break the literal.
            const conceptsLiteral = (concepts ?? [])
                .map(concept => this._escapeRDFString(concept))
                .join(', ');

            // Create research session metadata
            const sessionURI = `${config.storageGraph}/session/${Date.now()}`;
            const triples = [
                `<${sessionURI}> a ragno:ResearchSession ;`,
                `    ragno:originalQuestion "${this._escapeRDFString(originalQuestion)}" ;`,
                `    ragno:conceptsUsed "${conceptsLiteral}" ;`,
                `    ragno:entitiesFound ${ragnoEntities.length} ;`,
                `    dcterms:created "${new Date().toISOString()}"^^xsd:dateTime .`
            ];

            // Add entity triples
            for (const entity of ragnoEntities) {
                if (Array.isArray(entity.triples)) {
                    triples.push(...entity.triples);
                }
            }

            const insertQuery = `
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX ragno: <http://purl.org/stuff/ragno/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX prov: <http://www.w3.org/ns/prov#>

INSERT DATA {
    GRAPH <${config.storageGraph}> {
        ${triples.join('\n        ')}
    }
}`;

            const result = await sparqlHelper.executeUpdate(insertQuery);

            return {
                success: Boolean(result?.success),
                triplesStored: triples.length,
                sessionURI,
                graph: config.storageGraph
            };

        } catch (error) {
            logger.error('Failed to store research results:', error.message);
            return {
                success: false,
                error: error.message,
                triplesStored: 0
            };
        }
    }

    /**
     * Escape special characters in RDF strings.
     *
     * Escapes backslash, double quote, newline, carriage return and tab.
     * `null`/`undefined` become the empty string; other non-strings are
     * stringified first.
     * @private
     */
    _escapeRDFString(str) {
        return String(str ?? '')
            .replace(/\\/g, '\\\\')
            .replace(/"/g, '\\"')
            .replace(/\n/g, '\\n')
            .replace(/\r/g, '\\r')
            .replace(/\t/g, '\\t');
    }

    /**
     * Clear statistics: reset all counters to zero and drop recorded sessions.
     */
    clearStatistics() {
        this.stats = {
            totalResearches: 0,
            conceptsExtracted: 0,
            entitiesFound: 0,
            entitiesConverted: 0,
            researchSessions: []
        };
    }
}