import rdf from 'rdf-ext';
import SemanticUnit from './SemanticUnit.js';
import Entity from './Entity.js';
import Relationship from './Relationship.js';
import RDFGraphManager from './core/RDFGraphManager.js';
import NamespaceManager from './core/NamespaceManager.js';
import { logger } from '../Utils.js';
import { getPromptManager, PromptContext, PromptOptions } from '../prompts/index.js';
import ParseHelper from '../utils/ParseHelper.js';
/**
* Decompose text chunks into RDF-based semantic units, entities, and relationships
* @param {Array<{content: string, source: string}>} textChunks - Text chunks to decompose
* @param {Object} llmHandler - LLM handler instance for executing prompts
* @param {Object} [options] - Decomposition options
* @param {boolean} [options.extractRelationships=true] - Extract relationships between entities
* @param {boolean} [options.generateSummaries=true] - Summarize units longer than 100 characters
* @param {number} [options.minEntityConfidence=0.3] - Minimum confidence for extracted entities
* @param {number} [options.maxEntitiesPerUnit=10] - Maximum number of entities kept per unit
* @param {string} [options.model='mistral-small-latest'] - Model name passed to the LLM handler
* @returns {Promise<{units: SemanticUnit[], entities: Entity[], relationships: Relationship[], dataset: Dataset, connections: Array<Object>, statistics: Object}>}
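*
* @example
* // Minimal usage sketch; assumes an llmHandler exposing generateResponse(prompt, systemPrompt, options)
* const { units, entities, relationships, dataset } = await decomposeCorpus(
* [{ content: 'ACME Corp acquired Widgets Ltd in 2021.', source: 'news/acme.txt' }],
* llmHandler,
* { minEntityConfidence: 0.5, maxEntitiesPerUnit: 5 }
* );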
*/
export async function decomposeCorpus(textChunks, llmHandler, options = {}) {
const startTime = Date.now();
logger.info(`Starting corpus decomposition: ${textChunks.length} chunks`);
const promptManager = getPromptManager();
// Ensure we have an LLM handler
if (!llmHandler) {
throw new Error('LLM handler is required for corpus decomposition');
}
// Spread options first so the normalized values below always take precedence
const opts = {
...options,
extractRelationships: options.extractRelationships !== false,
generateSummaries: options.generateSummaries !== false,
minEntityConfidence: options.minEntityConfidence ?? 0.3, // ?? preserves an explicit 0 threshold
maxEntitiesPerUnit: options.maxEntitiesPerUnit ?? 10,
model: options.model || 'mistral-small-latest' // Allow model override
};
const namespaceManager = new NamespaceManager();
const rdfManager = new RDFGraphManager({ namespace: namespaceManager });
const dataset = rdf.dataset();
const units = [];
const entitiesMap = new Map();
const relationships = [];
const unitEntityConnections = [];
try {
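// Phase 1: split each chunk into semantic units and extract entity mentions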
for (let chunkIndex = 0; chunkIndex < textChunks.length; chunkIndex++) {
const chunk = textChunks[chunkIndex];
logger.debug(`Processing chunk ${chunkIndex + 1}/${textChunks.length}`);
const unitTexts = await extractSemanticUnits(chunk.content, promptManager, llmHandler, opts);
for (let unitIndex = 0; unitIndex < unitTexts.length; unitIndex++) {
const unitText = unitTexts[unitIndex];
const unitId = `unit_${chunkIndex}_${unitIndex}`;
let summary = '';
if (opts.generateSummaries && unitText.length > 100) {
summary = await generateUnitSummary(unitText, promptManager, llmHandler, opts);
}
const unit = new SemanticUnit({
dataset: rdf.dataset(),
text: unitText,
summary: summary,
source: chunk.source,
position: Math.max(0, chunk.content.indexOf(unitText)), // character offset within the chunk; falls back to 0 if the LLM paraphrased
length: unitText.length
});
units.push(unit);
unit.exportToDataset(dataset);
const unitEntities = await extractEntitiesFromUnit(unitText, promptManager, llmHandler, opts);
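// Deduplicate by name: reuse an existing Entity and bump its frequency rather than creating a duplicate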
for (const entityData of unitEntities) {
let entity = entitiesMap.get(entityData.name);
if (!entity) {
entity = new Entity({
name: entityData.name,
isEntryPoint: entityData.isEntryPoint || false,
subType: entityData.type || 'general',
confidence: entityData.confidence ?? 1.0,
alternativeLabels: entityData.alternatives || [],
source: chunk.source
});
entitiesMap.set(entityData.name, entity);
entity.exportToDataset(dataset);
} else {
entity.incrementFrequency();
entity.addSource(chunk.source);
}
unit.addEntityMention(entity.getURI(), entityData.relevance ?? 1.0);
unitEntityConnections.push({
unit: unit,
entity: entity,
relevance: entityData.relevance ?? 1.0,
context: unitText
});
}
logger.debug(`Unit ${unitId}: ${unitEntities.length} entities extracted`);
}
}
if (opts.extractRelationships && entitiesMap.size > 1) {
logger.info('Phase 2: Extracting relationships between entities...');
const entityList = Array.from(entitiesMap.values());
const relationshipData = await extractRelationships(entityList, units, promptManager, llmHandler, opts);
for (const relData of relationshipData) {
const sourceEntity = entitiesMap.get(relData.source);
const targetEntity = entitiesMap.get(relData.target);
if (sourceEntity && targetEntity) {
const relationship = new Relationship({
id: `rel_${relationships.length}`,
sourceEntity: sourceEntity.getURI(),
targetEntity: targetEntity.getURI(),
relationshipType: relData.type || 'related',
content: relData.content || '',
weight: relData.weight ?? 0.5,
evidence: relData.evidence || [],
bidirectional: relData.bidirectional || false
});
relationships.push(relationship);
relationship.exportToDataset(dataset);
}
}
logger.info(`Created ${relationships.length} relationships`);
}
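// Phase 3: link consecutive units with lightweight sequential 'follows' relationships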
await createInterUnitRelationships(units, dataset, rdfManager);
const processingTime = Date.now() - startTime;
logger.info(`Corpus decomposition completed in ${processingTime}ms: ${units.length} units, ${entitiesMap.size} entities, ${relationships.length} relationships`);
return {
units,
entities: Array.from(entitiesMap.values()),
relationships,
dataset,
connections: unitEntityConnections,
statistics: {
processingTime,
totalChunks: textChunks.length,
totalUnits: units.length,
totalEntities: entitiesMap.size,
totalRelationships: relationships.length,
averageEntitiesPerUnit: units.length > 0 ? entitiesMap.size / units.length : 0
}
};
} catch (error) {
logger.error('Corpus decomposition failed:', error);
throw error;
}
}
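/**
* Ask the LLM to split a text chunk into self-contained semantic units.
* Falls back to naive sentence splitting if the call or JSON parsing fails.
* @returns {Promise<string[]>}
*/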
async function extractSemanticUnits(text, promptManager, llmHandler, options) {
const context = new PromptContext({
arguments: { text },
model: options.model
});
const promptOptions = new PromptOptions({ temperature: 0.1 });
try {
const result = await promptManager.generatePrompt('decomposition-extract-units', context, promptOptions);
const response = await llmHandler.generateResponse(result.content, '', {
model: options.model,
temperature: 0.1
});
const cleanedResponse = ParseHelper.resolveSyntax(response);
const units = JSON.parse(cleanedResponse);
return Array.isArray(units) ? units.filter(u => typeof u === 'string' && u.trim().length > 0) : [text];
} catch (error) {
logger.warn('LLM unit extraction failed, using sentence splitting fallback:', error.message);
return text.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 10);
}
}
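/**
* Generate a short summary for a semantic unit via the LLM.
* Falls back to a truncated excerpt of the unit text on failure.
* @returns {Promise<string>}
*/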
async function generateUnitSummary(unitText, promptManager, llmHandler, options) {
const context = new PromptContext({
arguments: { unitText },
model: options.model
});
const promptOptions = new PromptOptions({ temperature: 0.1, maxTokens: 100 });
try {
const result = await promptManager.generatePrompt('decomposition-summarize-unit', context, promptOptions);
const response = await llmHandler.generateResponse(result.content, '', {
model: options.model,
temperature: 0.1
});
return response.trim();
} catch (error) {
logger.warn('Summary generation failed:', error.message);
return unitText.substring(0, 100) + '...';
}
}
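/**
* Extract named entities from a single semantic unit, filtered by
* options.minEntityConfidence and capped at options.maxEntitiesPerUnit.
* Falls back to capitalized word sequences when the LLM call fails.
* @returns {Promise<Array<Object>>} Entity descriptors ({ name, type, confidence, relevance, ... })
*/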
async function extractEntitiesFromUnit(unitText, promptManager, llmHandler, options) {
const context = new PromptContext({
arguments: { unitText },
model: options.model
});
const promptOptions = new PromptOptions({ temperature: 0.1, maxTokens: 500 });
try {
const result = await promptManager.generatePrompt('decomposition-extract-entities', context, promptOptions);
const response = await llmHandler.generateResponse(result.content, '', {
model: options.model,
temperature: 0.1
});
const cleanedResponse = ParseHelper.resolveSyntax(response);
const entities = JSON.parse(cleanedResponse);
if (!Array.isArray(entities)) return [];
return entities
.filter(e => e.name && e.name.length > 1 && (e.confidence ?? 1.0) >= options.minEntityConfidence)
.slice(0, options.maxEntitiesPerUnit);
} catch (error) {
logger.warn('Entity extraction failed, using fallback:', error.message);
// Fallback: treat capitalized word sequences as candidate entity names
const words = unitText.match(/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/g) || [];
return words.slice(0, options.maxEntitiesPerUnit)
.map(name => ({ name, type: 'general', relevance: 0.5, isEntryPoint: false, confidence: 0.5 }));
}
}
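/**
* For each unit mentioning at least two known entities, ask the LLM to
* propose relationships between them; the unit's URI is recorded as evidence.
* @returns {Promise<Array<Object>>} Relationship descriptors ({ source, target, type, weight, ... })
*/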
async function extractRelationships(entities, units, promptManager, llmHandler, options) {
const relationships = [];
const entityNames = entities.map(e => e.getPreferredLabel() || '').filter(Boolean);
for (const unit of units) {
const unitContent = unit.getContent().toLowerCase();
const unitEntityNames = entityNames.filter(name => unitContent.includes(name.toLowerCase()));
if (unitEntityNames.length < 2) continue;
const context = new PromptContext({
arguments: {
entityNames: JSON.stringify(unitEntityNames),
unitText: unit.getContent()
},
model: options.model
});
const promptOptions = new PromptOptions({ temperature: 0.1, maxTokens: 300 });
try {
const result = await promptManager.generatePrompt('decomposition-extract-relationships', context, promptOptions);
const response = await llmHandler.generateResponse(result.content, '', {
model: options.model,
temperature: 0.1
});
const cleanedResponse = ParseHelper.resolveSyntax(response);
const unitRelationships = JSON.parse(cleanedResponse);
if (Array.isArray(unitRelationships)) {
for (const rel of unitRelationships) {
if (rel.source && rel.target && rel.source !== rel.target) {
relationships.push({ ...rel, evidence: [unit.getURI()], bidirectional: rel.bidirectional || false });
}
}
}
} catch (error) {
logger.warn(`Relationship extraction failed for unit: ${error.message}`);
}
}
return relationships;
}
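/**
* Link consecutive semantic units with low-weight sequential 'follows'
* relationships so narrative order is preserved in the RDF dataset. These
* relationships are written directly to the dataset and are not returned.
*/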
async function createInterUnitRelationships(units, dataset, rdfManager) {
logger.debug('Creating inter-unit relationships...');
for (let i = 0; i < units.length - 1; i++) {
const currentUnit = units[i];
const nextUnit = units[i + 1];
const relationship = new Relationship({
id: `unit_rel_${i}`,
sourceEntity: currentUnit.getURI(),
targetEntity: nextUnit.getURI(),
relationshipType: 'follows',
content: 'Sequential narrative flow',
weight: 0.3,
bidirectional: false
});
relationship.exportToDataset(dataset);
}
logger.debug(`Created ${Math.max(0, units.length - 1)} inter-unit relationships`);
}