/**
* Memorise.js - Integrated Memory Ingestion Module for Semem
*
 * This module provides a unified interface for ingesting text into the Semem
 * memory system by orchestrating the complete document processing pipeline:
 *
 * Text Input → ragno:Unit → ragno:TextElement → Chunking → Embeddings
 * → Concept Extraction → Decomposition → SPARQL Storage
*
* Based on the patterns from:
* - examples/document/LoadPDFs.js
* - examples/document/ChunkDocuments.js
* - examples/document/MakeEmbeddings.js
* - examples/document/ExtractConcepts.js
* - examples/document/EnhanceCorpuscles.js
* - examples/document/Decompose.js
*
* Follows infrastructure patterns from docs/manual/infrastructure.md
* Uses SPARQL service layer as described in docs/manual/sparql-service.md
*/
import Config from '../Config.js';
import { SPARQLQueryService } from '../services/sparql/index.js';
import SPARQLHelper from '../services/sparql/SPARQLHelper.js';
import LLMHandler from '../handlers/LLMHandler.js';
import EmbeddingHandler from '../handlers/EmbeddingHandler.js';
import EmbeddingConnectorFactory from '../connectors/EmbeddingConnectorFactory.js';
import MistralConnector from '../connectors/MistralConnector.js';
import ClaudeConnector from '../connectors/ClaudeConnector.js';
import OllamaConnector from '../connectors/OllamaConnector.js';
import Chunker from '../services/document/Chunker.js';
import { URIMinter } from '../utils/URIMinter.js';
import { decomposeCorpus } from './decomposeCorpus.js';
import { CreateConceptsUnified } from './CreateConceptsUnified.js';
import logger from 'loglevel';
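/**
 * Memorise orchestrates the full ingestion pipeline for a single text input.
 *
 * Minimal usage sketch (assumes config/config.json points at a reachable
 * SPARQL store, with Ollama available as a local fallback provider):
 *
 *   const memorise = new Memorise();          // optionally pass a config path
 *   await memorise.memorize('Some text to remember.');
 *   await memorise.cleanup();
 */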
export default class Memorise {
constructor(configPath = null) {
this.config = null;
this.queryService = null;
this.sparqlHelper = null;
this.llmHandler = null;
this.embeddingHandler = null;
this.chunker = null;
this.conceptExtractor = null;
this.initialized = false;
this.configPath = configPath;
// Statistics tracking
this.stats = {
textLength: 0,
unitsCreated: 0,
textElementsCreated: 0,
chunksCreated: 0,
embeddingsCreated: 0,
conceptsExtracted: 0,
entitiesCreated: 0,
relationshipsCreated: 0,
processingTimeMs: 0,
errors: []
};
}
/**
* Initialize all required services and handlers
*/
async init() {
if (this.initialized) return;
logger.info('🚀 Initializing Memorise module...');
try {
// 1. Initialize configuration
await this.initializeConfig();
// 2. Initialize SPARQL services
await this.initializeSPARQLServices();
// 3. Initialize LLM and embedding handlers
await this.initializeLLMHandler();
await this.initializeEmbeddingHandler();
// 4. Initialize document processing components
await this.initializeDocumentProcessing();
this.initialized = true;
logger.info('✅ Memorise module initialized successfully');
} catch (error) {
logger.error('❌ Failed to initialize Memorise module:', error.message);
throw error;
}
}
/**
* Initialize configuration following semem patterns
*/
async initializeConfig() {
const configPath = this.configPath ||
(process.cwd().endsWith('/examples/document')
? '../../config/config.json'
: 'config/config.json');
this.config = new Config(configPath);
await this.config.init();
logger.debug(`Configuration loaded from: ${configPath}`);
}
/**
* Initialize SPARQL services following infrastructure patterns
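 *
 * Expects a SPARQL storage section in config.json shaped roughly like the
 * following (field names taken from the reads below; values illustrative):
 *
 *   "storage": {
 *     "type": "sparql",
 *     "options": {
 *       "update": "http://localhost:3030/semem/update",
 *       "user": "admin",
 *       "password": "secret",
 *       "graphName": "http://hyperdata.it/content"
 *     }
 *   }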
*/
async initializeSPARQLServices() {
// Initialize query service
this.queryService = new SPARQLQueryService({
queryPath: process.cwd().endsWith('/examples/document')
? '../../sparql/queries'
: 'sparql/queries'
});
// Initialize SPARQL helper
const storageConfig = this.config.get('storage');
        if (!storageConfig || storageConfig.type !== 'sparql') {
throw new Error('Memorise requires SPARQL storage configuration');
}
this.sparqlHelper = new SPARQLHelper(storageConfig.options.update, {
auth: {
user: storageConfig.options.user,
password: storageConfig.options.password
}
});
logger.debug('SPARQL services initialized');
}
/**
* Initialize LLM handler with priority-based provider selection
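 *
 * Reads the llmProviders array from config.json; each entry is expected to
 * look roughly like this (field names taken from the reads below; values
 * illustrative):
 *
 *   { "type": "ollama", "priority": 2, "capabilities": ["chat"],
 *     "chatModel": "qwen2:1.5b", "baseUrl": "http://localhost:11434" }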
*/
async initializeLLMHandler() {
try {
const llmProviders = this.config.get('llmProviders') || [];
const chatProviders = llmProviders.filter(p => p.capabilities?.includes('chat'));
const sortedProviders = chatProviders.sort((a, b) => (a.priority || 999) - (b.priority || 999));
let llmProvider = null;
let chatModel = null;
// Try providers in priority order
for (const provider of sortedProviders) {
try {
if (provider.type === 'mistral' && process.env.MISTRAL_API_KEY) {
chatModel = provider.chatModel || 'mistral-small-latest';
llmProvider = new MistralConnector(process.env.MISTRAL_API_KEY);
logger.info(`Using Mistral LLM with model: ${chatModel}`);
break;
} else if (provider.type === 'claude' && process.env.ANTHROPIC_API_KEY) {
chatModel = provider.chatModel || 'claude-3-haiku-20240307';
llmProvider = new ClaudeConnector(process.env.ANTHROPIC_API_KEY);
logger.info(`Using Claude LLM with model: ${chatModel}`);
break;
} else if (provider.type === 'ollama') {
chatModel = provider.chatModel || 'qwen2:1.5b';
const ollamaBaseUrl = provider.baseUrl || this.config.get('ollama.baseUrl') || 'http://localhost:11434';
llmProvider = new OllamaConnector(ollamaBaseUrl, chatModel);
logger.info(`Using Ollama LLM at: ${ollamaBaseUrl} with model: ${chatModel}`);
break;
}
} catch (error) {
logger.warn(`Failed to initialize ${provider.type} provider: ${error.message}`);
continue;
}
}
// Fallback to Ollama if no other provider works
if (!llmProvider) {
logger.warn('No configured LLM provider available, falling back to Ollama');
const ollamaBaseUrl = this.config.get('ollama.baseUrl') || 'http://localhost:11434';
chatModel = 'qwen2:1.5b';
llmProvider = new OllamaConnector(ollamaBaseUrl, chatModel);
}
this.llmHandler = new LLMHandler(llmProvider, chatModel);
logger.debug(`LLM handler initialized with ${chatModel}`);
} catch (error) {
logger.error('Failed to initialize LLM handler:', error.message);
// Emergency fallback
            this.llmHandler = new LLMHandler(new OllamaConnector('http://localhost:11434', 'qwen2:1.5b'), 'qwen2:1.5b');
logger.warn('Using emergency fallback to Ollama');
}
}
/**
* Initialize embedding handler with provider selection
*/
async initializeEmbeddingHandler() {
try {
const embeddingProviders = this.config.get('llmProviders') || [];
const sortedProviders = embeddingProviders
.filter(p => p.capabilities?.includes('embedding'))
.sort((a, b) => (a.priority || 999) - (b.priority || 999));
let providerConfig = null;
let embeddingDimension = 1536; // Default
// Try providers in priority order
for (const provider of sortedProviders) {
if (provider.type === 'nomic' && process.env.NOMIC_API_KEY) {
providerConfig = {
provider: 'nomic',
apiKey: process.env.NOMIC_API_KEY,
model: provider.embeddingModel || 'nomic-embed-text'
};
embeddingDimension = 768;
logger.info(`Using Nomic embedding provider`);
break;
} else if (provider.type === 'ollama') {
const ollamaBaseUrl = provider.baseUrl || this.config.get('ollama.baseUrl') || 'http://localhost:11434';
providerConfig = {
provider: 'ollama',
baseUrl: ollamaBaseUrl,
model: provider.embeddingModel || 'nomic-embed-text'
};
                    embeddingDimension = 768; // nomic-embed-text produces 768-dimensional vectors
logger.info(`Using Ollama embedding provider at: ${ollamaBaseUrl}`);
break;
}
}
// Fallback to Ollama
if (!providerConfig) {
const ollamaBaseUrl = this.config.get('ollama.baseUrl') || 'http://localhost:11434';
providerConfig = {
provider: 'ollama',
baseUrl: ollamaBaseUrl,
model: 'nomic-embed-text'
            };
            embeddingDimension = 768; // match the nomic-embed-text dimension used above
logger.info(`Defaulting to Ollama embedding provider`);
}
const embeddingConnector = EmbeddingConnectorFactory.createConnector(providerConfig);
this.embeddingHandler = new EmbeddingHandler(embeddingConnector, providerConfig.model, embeddingDimension);
logger.debug(`Embedding handler initialized with ${providerConfig.provider} connector`);
} catch (error) {
logger.error('Failed to initialize embedding handler:', error.message);
throw error;
}
}
/**
* Initialize document processing components
*/
async initializeDocumentProcessing() {
// Initialize chunker
this.chunker = new Chunker({
maxChunkSize: 2000,
minChunkSize: 100,
overlapSize: 100,
strategy: 'semantic',
baseNamespace: 'http://purl.org/stuff/instance/'
});
// Initialize concept extractor
this.conceptExtractor = new CreateConceptsUnified();
await this.conceptExtractor.init();
logger.debug('Document processing components initialized');
}
/**
     * Main method to ingest text into the memory system
* @param {string} text - Text content to memorize
* @param {Object} options - Processing options
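     * @param {string} [options.title] - Label for the ragno:Unit (defaults to a timestamped label)
     * @param {string} [options.source] - Provenance string stored as semem:sourceFile (defaults to 'direct-input')
     * @param {string} [options.graph] - Target named graph (defaults to the configured graphName)
     * @param {boolean} [options.extractConcepts] - Also run concept extraction and decomposition (Steps 4-5)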
* @returns {Object} Processing results and statistics
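     *
     * @example
     * // A sketch; assumes the configured SPARQL store is reachable
     * const result = await memorise.memorize(text, { title: 'Note', extractConcepts: true });
     * // result: { success, unitURI, textElementURI, chunks, decompositionResults, statistics }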
*/
async memorize(text, options = {}) {
if (!this.initialized) {
await this.init();
}
// Input validation
if (typeof text !== 'string') {
throw new Error('Text input must be a string');
}
if (text.trim().length === 0) {
throw new Error('Text input cannot be empty');
}
const startTime = Date.now();
        // Reset per-call statistics so repeated memorize() calls don't carry over counts
        this.stats = {
            textLength: text.length, unitsCreated: 0, textElementsCreated: 0,
            chunksCreated: 0, embeddingsCreated: 0, conceptsExtracted: 0,
            entitiesCreated: 0, relationshipsCreated: 0, processingTimeMs: 0, errors: []
        };
logger.info(`🧠 Starting memory ingestion for ${text.length} characters of text...`);
try {
const targetGraph = options.graph || this.config.get('storage.options.graphName') ||
this.config.get('graphName') || 'http://hyperdata.it/content';
// Step 1: Create ragno:Unit and ragno:TextElement
logger.info('📄 Step 1: Creating ragno:Unit and ragno:TextElement...');
const { unitURI, textElementURI } = await this.createTextUnit(text, targetGraph, options);
// Step 2: Chunk the text
logger.info('✂️ Step 2: Chunking text...');
const chunks = await this.chunkText(textElementURI, text, targetGraph);
// Step 3: Create embeddings for chunks
logger.info('🔢 Step 3: Creating embeddings...');
await this.createEmbeddings(chunks, targetGraph);
// Step 4: Extract concepts (optional, controlled by extractConcepts option)
let decompositionResults = null;
if (options.extractConcepts) {
logger.info('🧠 Step 4: Extracting concepts...');
await this.extractConcepts(chunks, targetGraph);
// Step 5: Decompose into entities and relationships
logger.info('🕸️ Step 5: Decomposing into entities and relationships...');
decompositionResults = await this.decomposeText(chunks, targetGraph);
} else {
                logger.info('⏭️ Step 4: Skipping concept extraction (extractConcepts option not set)');
                logger.info('⏭️ Step 5: Skipping entity decomposition (extractConcepts option not set)');
}
// Calculate final statistics
this.stats.processingTimeMs = Date.now() - startTime;
logger.info('✅ Memory ingestion completed successfully');
this.logProcessingSummary();
return {
success: true,
unitURI,
textElementURI,
chunks: chunks.length,
decompositionResults,
statistics: { ...this.stats }
};
} catch (error) {
this.stats.errors.push(error.message);
this.stats.processingTimeMs = Date.now() - startTime;
logger.error('❌ Memory ingestion failed:', error.message);
throw error;
}
}
/**
* Create ragno:Unit and ragno:TextElement from input text
*/
async createTextUnit(text, targetGraph, options = {}) {
const title = options.title || `Memory ingestion ${new Date().toISOString()}`;
const source = options.source || 'direct-input';
// Generate URIs
const unitURI = URIMinter.mintURI('http://purl.org/stuff/instance/', 'unit', text.substring(0, 100));
const textElementURI = URIMinter.mintURI('http://purl.org/stuff/instance/', 'text', text);
        const now = new Date().toISOString();
        // Escape backslashes first, then quotes, so arbitrary text survives SPARQL literal parsing
        const escapeLiteral = (s) => s.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
// Create SPARQL INSERT query for Unit and TextElement
const updateQuery = `
PREFIX ragno: <http://purl.org/stuff/ragno/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX semem: <http://semem.hyperdata.it/>
INSERT DATA {
GRAPH <${targetGraph}> {
# Store ragno:Unit
<${unitURI}> a ragno:Unit ;
rdfs:label """${title.replace(/"/g, '\\"')}""" ;
dcterms:created "${now}"^^xsd:dateTime ;
semem:sourceFile "${source}" ;
ragno:hasTextElement <${textElementURI}> .
# Store ragno:TextElement
<${textElementURI}> a ragno:TextElement ;
rdfs:label """${title} text content""" ;
dcterms:created "${now}"^^xsd:dateTime ;
ragno:content """${text.replace(/"/g, '\\"')}""" ;
dcterms:extent ${text.length} ;
prov:wasDerivedFrom <${unitURI}> .
}
}
`;
await this.sparqlHelper.executeUpdate(updateQuery);
this.stats.unitsCreated = 1;
this.stats.textElementsCreated = 1;
logger.debug(`Created ragno:Unit: ${unitURI}`);
logger.debug(`Created ragno:TextElement: ${textElementURI}`);
return { unitURI, textElementURI };
}
/**
* Chunk text using the document Chunker
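     *
     * Chunks are persisted with the Ordered List Ontology (olo) so their
     * original order is preserved. The stored graph looks roughly like this
     * (URIs illustrative):
     *
     *   <chunkList> a olo:OrderedList ; olo:length 3 ; olo:slot <slot1> .
     *   <slot1> a olo:Slot ; olo:index 1 ; olo:item <chunk1> ;
     *           olo:ordered_list <chunkList> .
     *   <chunk1> a ragno:Unit, ragno:TextElement ; ragno:content "..." ;
     *            prov:wasDerivedFrom <textElement> .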
*/
async chunkText(textElementURI, text, targetGraph) {
const chunkingResult = await this.chunker.chunk(text, {
title: `TextElement ${textElementURI.split('/').pop()}`,
sourceUri: textElementURI
});
logger.info(`Created ${chunkingResult.chunks.length} chunks`);
// Generate URIs and store chunks using OLO structure (following ChunkDocuments.js pattern)
const chunkListURI = URIMinter.mintURI('http://purl.org/stuff/instance/', 'chunklist', textElementURI);
const chunkTriples = [];
const slotTriples = [];
chunkingResult.chunks.forEach((chunk, index) => {
const chunkURI = chunk.uri;
const slotURI = URIMinter.mintURI('http://purl.org/stuff/instance/', 'slot', `${textElementURI}-${index}`);
// Chunk as both ragno:Unit and ragno:TextElement for embeddings
chunkTriples.push(`
<${chunkURI}> a ragno:Unit, ragno:TextElement ;
ragno:content """${chunk.content.replace(/"/g, '\\"')}""" ;
dcterms:extent ${chunk.size} ;
olo:index ${chunk.index} ;
prov:wasDerivedFrom <${textElementURI}> ;
dcterms:created "${new Date().toISOString()}"^^xsd:dateTime .`);
// OLO slot structure
slotTriples.push(`
<${slotURI}> a olo:Slot ;
olo:index ${index + 1} ;
olo:item <${chunkURI}> ;
olo:ordered_list <${chunkListURI}> .
<${chunkListURI}> olo:slot <${slotURI}> .`);
});
// Store chunks with OLO indexing
const chunksUpdateQuery = `
PREFIX ragno: <http://purl.org/stuff/ragno/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX olo: <http://purl.org/ontology/olo/core#>
PREFIX semem: <http://semem.hyperdata.it/>
INSERT DATA {
GRAPH <${targetGraph}> {
# Mark original TextElement as having chunks
<${textElementURI}> semem:hasChunks true .
# Create chunk list
<${chunkListURI}> a olo:OrderedList ;
rdfs:label "Chunks for ${textElementURI.split('/').pop()}" ;
olo:length ${chunkingResult.chunks.length} ;
dcterms:created "${new Date().toISOString()}"^^xsd:dateTime .
# Store chunks
${chunkTriples.join('\n')}
# Store OLO slots
${slotTriples.join('\n')}
}
}
`;
await this.sparqlHelper.executeUpdate(chunksUpdateQuery);
this.stats.chunksCreated = chunkingResult.chunks.length;
return chunkingResult.chunks;
}
/**
* Create embeddings for all chunks
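     *
     * Each vector is stored as a comma-separated string literal on the chunk's
     * ragno:embedding property; per-chunk failures are logged and counted
     * rather than aborting the run.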
*/
async createEmbeddings(chunks, targetGraph) {
let embeddingsCreated = 0;
for (const chunk of chunks) {
try {
// Generate embedding
const embedding = await this.embeddingHandler.generateEmbedding(chunk.content);
const embeddingString = embedding.join(',');
// Store embedding
const embeddingUpdateQuery = `
PREFIX ragno: <http://purl.org/stuff/ragno/>
INSERT DATA {
GRAPH <${targetGraph}> {
<${chunk.uri}> ragno:embedding "${embeddingString}" .
}
}
`;
await this.sparqlHelper.executeUpdate(embeddingUpdateQuery);
embeddingsCreated++;
} catch (error) {
logger.warn(`Failed to create embedding for chunk ${chunk.uri}: ${error.message}`);
this.stats.errors.push(`Embedding failed for chunk: ${error.message}`);
}
}
this.stats.embeddingsCreated = embeddingsCreated;
logger.info(`Created ${embeddingsCreated} embeddings`);
}
/**
* Extract concepts from specific chunks
*/
async extractConcepts(chunks, targetGraph) {
logger.info(`Processing concepts for ${chunks.length} specific chunks from current ingestion`);
let conceptsExtracted = 0;
const conceptResults = [];
for (const chunk of chunks) {
try {
                // Adapt the chunk to the SPARQL-binding shape the concept extractor expects
const textElement = {
textElement: { value: chunk.uri },
content: { value: chunk.content }
};
const result = await this.conceptExtractor.processTextElement(textElement, targetGraph);
conceptResults.push(result);
conceptsExtracted += result.conceptCount;
logger.debug(`Extracted ${result.conceptCount} concepts from chunk: ${chunk.uri}`);
} catch (error) {
logger.warn(`Failed to extract concepts from chunk ${chunk.uri}: ${error.message}`);
this.stats.errors.push(`Concept extraction failed for chunk: ${error.message}`);
}
}
this.stats.conceptsExtracted = conceptsExtracted;
logger.info(`Extracted ${conceptsExtracted} concepts from ${chunks.length} chunks`);
return conceptResults;
}
/**
* Decompose text into entities and relationships using ragno
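     *
     * decomposeCorpus is expected to return { dataset, entities, relationships },
     * where dataset is an RDF/JS DatasetCore; its quads are serialized below
     * into N-Triples-style statements for a single INSERT DATA update.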
*/
async decomposeText(chunks, targetGraph) {
const textChunks = chunks.map(chunk => ({
content: chunk.content,
source: chunk.uri,
metadata: {
sourceUnit: chunk.uri
}
}));
try {
const decompositionResults = await decomposeCorpus(textChunks, this.llmHandler, {
extractRelationships: true,
generateSummaries: true,
minEntityConfidence: 0.4,
maxEntitiesPerUnit: 15
});
// Store the RDF dataset from decomposition results
if (decompositionResults.dataset && decompositionResults.dataset.size > 0) {
logger.info(`Storing decomposition results: ${decompositionResults.dataset.size} triples`);
// Convert dataset to properly escaped SPARQL triples
const quads = Array.from(decompositionResults.dataset);
const triples = quads.map(quad => {
                // NamedNode subjects are wrapped in <>; blank node labels need the _: prefix
                const subject = quad.subject.termType === 'NamedNode' ? `<${quad.subject.value}>` : `_:${quad.subject.value}`;
const predicate = `<${quad.predicate.value}>`;
let object;
if (quad.object.termType === 'NamedNode') {
object = `<${quad.object.value}>`;
} else if (quad.object.termType === 'Literal') {
// Use SPARQLHelper for proper literal escaping
if (quad.object.datatype) {
object = SPARQLHelper.createLiteral(quad.object.value, quad.object.datatype.value);
} else if (quad.object.language) {
object = SPARQLHelper.createLiteral(quad.object.value, null, quad.object.language);
} else {
object = SPARQLHelper.createLiteral(quad.object.value);
}
                } else {
                    // BlankNode: RDF/JS values carry the label without the _: prefix SPARQL requires
                    object = `_:${quad.object.value}`;
                }
return `${subject} ${predicate} ${object} .`;
}).join('\n');
// Use SPARQLHelper to create and execute the insert query
const insertQuery = this.sparqlHelper.createInsertDataQuery(targetGraph, triples);
const result = await this.sparqlHelper.executeUpdate(insertQuery);
if (!result.success) {
throw new Error(`SPARQL insert failed: ${result.error}`);
}
}
this.stats.entitiesCreated = decompositionResults.entities?.length || 0;
this.stats.relationshipsCreated = decompositionResults.relationships?.length || 0;
logger.info(`Created ${this.stats.entitiesCreated} entities and ${this.stats.relationshipsCreated} relationships`);
return decompositionResults;
} catch (error) {
logger.error(`Decomposition failed: ${error.message}`);
this.stats.errors.push(`Decomposition failed: ${error.message}`);
throw error;
}
}
/**
* Log processing summary
*/
logProcessingSummary() {
logger.info('\n📊 Memory Ingestion Summary:');
logger.info(` 📄 Text length: ${this.stats.textLength} characters`);
logger.info(` 🏗️ Units created: ${this.stats.unitsCreated}`);
logger.info(` 📝 Text elements: ${this.stats.textElementsCreated}`);
logger.info(` ✂️ Chunks created: ${this.stats.chunksCreated}`);
logger.info(` 🔢 Embeddings: ${this.stats.embeddingsCreated}`);
logger.info(` 💡 Concepts extracted: ${this.stats.conceptsExtracted}`);
logger.info(` 🎯 Entities created: ${this.stats.entitiesCreated}`);
logger.info(` 🔗 Relationships: ${this.stats.relationshipsCreated}`);
logger.info(` ⏱️ Processing time: ${(this.stats.processingTimeMs / 1000).toFixed(2)}s`);
if (this.stats.errors.length > 0) {
logger.warn(` ⚠️ Errors: ${this.stats.errors.length}`);
this.stats.errors.forEach(error => {
logger.warn(` • ${error}`);
});
}
}
/**
* Cleanup resources
*/
async cleanup() {
if (this.conceptExtractor) {
await this.conceptExtractor.cleanup();
}
if (this.embeddingHandler && typeof this.embeddingHandler.dispose === 'function') {
await this.embeddingHandler.dispose();
}
if (this.queryService && typeof this.queryService.cleanup === 'function') {
this.queryService.cleanup();
}
logger.debug('Memorise module cleaned up');
}
}