/**
* VSOM.js - Vectorized Self-Organizing Map for Ragno Knowledge Graphs
*
* This is the main VSOM implementation that integrates with the Ragno knowledge graph
* system to provide entity clustering, visualization, and semantic organization
* capabilities. It combines the core algorithm, topology management, and training
* procedures into a unified interface.
*
* Key Features:
* - Entity clustering for knowledge graphs
* - Integration with SPARQL endpoints and in-memory data
* - RDF export with ragno ontology properties
* - Multiple data input sources
* - Visualization coordinate generation
* - Integration with existing Ragno algorithms
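*
* Example usage (a minimal sketch; assumes an embedding handler exposing an
* async generateEmbedding(text) method, which is provided elsewhere):
*
*   const vsom = new VSOM({ mapSize: [10, 10], embeddingDimension: 1536 })
*   await vsom.loadFromEntities(entities, embeddingHandler)
*   await vsom.train()
*   const clusters = vsom.getClusters()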
*/
import rdf from 'rdf-ext'
import VSOMCore from './vsom/VSOMCore.js'
import VSOMTopology from './vsom/VSOMTopology.js'
import VSOMTraining from './vsom/VSOMTraining.js'
import NamespaceManager from '../core/NamespaceManager.js'
import { logger } from '../../Utils.js'
export default class VSOM {
constructor(options = {}) {
this.options = {
// Map configuration
mapSize: options.mapSize || [20, 20],
topology: options.topology || 'rectangular',
boundaryCondition: options.boundaryCondition || 'bounded',
// Algorithm parameters
embeddingDimension: options.embeddingDimension || 1536,
distanceMetric: options.distanceMetric || 'cosine',
// Training parameters
maxIterations: options.maxIterations || 1000,
initialLearningRate: options.initialLearningRate || 0.1,
finalLearningRate: options.finalLearningRate || 0.01,
initialRadius: options.initialRadius || Math.max(...(options.mapSize || [20, 20])) / 4,
finalRadius: options.finalRadius || 0.5,
// Data handling
batchSize: options.batchSize || 100,
// Clustering
clusterThreshold: options.clusterThreshold || 0.8,
minClusterSize: options.minClusterSize || 3,
// RDF integration
uriBase: options.uriBase || 'http://example.org/ragno/',
exportToRDF: options.exportToRDF !== false,
// Performance
logProgress: options.logProgress !== false,
...options
}
// Initialize components
this.core = new VSOMCore({
distanceMetric: this.options.distanceMetric,
batchSize: this.options.batchSize
})
this.topology = new VSOMTopology({
topology: this.options.topology,
boundaryCondition: this.options.boundaryCondition,
mapSize: this.options.mapSize
})
this.training = new VSOMTraining({
maxIterations: this.options.maxIterations,
initialLearningRate: this.options.initialLearningRate,
finalLearningRate: this.options.finalLearningRate,
initialRadius: this.options.initialRadius,
finalRadius: this.options.finalRadius,
batchSize: this.options.batchSize,
logProgress: this.options.logProgress
})
this.namespaces = new NamespaceManager({ uriBase: this.options.uriBase })
// Data storage
this.entities = []
this.embeddings = []
this.entityMetadata = []
this.trained = false
this.clusters = null
this.nodeAssignments = null
// Training results
this.trainingResults = null
// Statistics
this.stats = {
totalEntities: 0,
totalClusters: 0,
trainingTime: 0,
lastTrainingDate: null,
dataLoadTime: 0,
lastDataLoadDate: null
}
logger.debug('VSOM initialized with options:', {
mapSize: this.options.mapSize,
topology: this.options.topology,
embeddingDimension: this.options.embeddingDimension
})
}
/**
* Load entities from an array with embedding generation
* @param {Array} entities - Array of Entity objects or entity data
* @param {Object} embeddingHandler - Embedding handler for vector generation
* @param {Object} [options] - Loading options
* @returns {Promise<Object>} Loading results
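* @example
* // Minimal sketch: embeddingHandler is any object with an async
* // generateEmbedding(text) method returning a numeric vector
* // const result = await vsom.loadFromEntities(entities, embeddingHandler, { batchSize: 50 })
* // logger.info(`${result.entitiesLoaded} loaded, ${result.entitiesSkipped} skipped`)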
*/
async loadFromEntities(entities, embeddingHandler, options = {}) {
const startTime = Date.now()
logger.info(`Loading ${entities.length} entities into VSOM`)
this.entities = []
this.embeddings = []
this.entityMetadata = []
const batchSize = options.batchSize || this.options.batchSize
let processedCount = 0
try {
// Process entities in batches
for (let i = 0; i < entities.length; i += batchSize) {
const batch = entities.slice(i, i + batchSize)
for (const entity of batch) {
// Extract entity information
const entityData = this.extractEntityData(entity)
// Generate embedding for entity content
const embedding = await embeddingHandler.generateEmbedding(entityData.content)
// Validate embedding dimension
if (embedding.length !== this.options.embeddingDimension) {
logger.warn(`Embedding dimension mismatch: expected ${this.options.embeddingDimension}, got ${embedding.length}`)
continue
}
this.entities.push(entity)
this.embeddings.push(embedding)
this.entityMetadata.push(entityData)
processedCount++
}
if (this.options.logProgress && (i + batchSize) % (batchSize * 10) === 0) {
logger.info(`Processed ${Math.min(i + batchSize, entities.length)}/${entities.length} entities`)
}
}
const loadTime = Date.now() - startTime
this.stats.totalEntities = processedCount
this.stats.dataLoadTime = loadTime
this.stats.lastDataLoadDate = new Date()
logger.info(`Loaded ${processedCount} entities in ${loadTime}ms`)
return {
entitiesLoaded: processedCount,
entitiesSkipped: entities.length - processedCount,
loadTime: loadTime,
averageEmbeddingTime: processedCount > 0 ? loadTime / processedCount : 0
}
} catch (error) {
logger.error('Error loading entities:', error)
throw error
}
}
/**
* Load entities from SPARQL endpoint
* @param {string} endpoint - SPARQL endpoint URL
* @param {string} query - SPARQL query to retrieve entities
* @param {Object} embeddingHandler - Embedding handler for vector generation
* @param {Object} [options] - Loading options
* @returns {Promise<Object>} Loading results
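* @example
* // Sketch only: the variable names (?entity, ?label, ?type) are the ones
* // processSPARQLResults() expects; the endpoint URL and prefixes are illustrative
* // const query = `
* //   SELECT ?entity ?label ?type WHERE {
* //     ?entity a ragno:Entity ; rdfs:label ?label .
* //   } LIMIT 1000`
* // await vsom.loadFromSPARQL('http://localhost:3030/ds/query', query, embeddingHandler)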
*/
async loadFromSPARQL(endpoint, query, embeddingHandler, options = {}) {
logger.info(`Loading entities from SPARQL endpoint: ${endpoint}`)
try {
// Execute SPARQL query
const sparqlResults = await this.executeSPARQLQuery(endpoint, query, options)
// Convert SPARQL results to entity format
const entities = this.processSPARQLResults(sparqlResults)
// Load the entities
return await this.loadFromEntities(entities, embeddingHandler, options)
} catch (error) {
logger.error('Error loading from SPARQL:', error)
throw error
}
}
/**
* Load entities from existing VectorIndex
* @param {Object} vectorIndex - VectorIndex instance
* @param {Object} [filters] - Filters to apply
* @returns {Promise<Object>} Loading results
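* @example
* // Sketch: filter values are compared with strict equality against
* // top-level properties of each indexed entity (see applyEntityFilters)
* // await vsom.loadFromVectorIndex(vectorIndex, { type: 'concept' })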
*/
async loadFromVectorIndex(vectorIndex, filters = {}) {
logger.info('Loading entities from VectorIndex')
try {
// Get all indexed entities
const indexedEntities = vectorIndex.getAllNodes()
// Apply filters
const filteredEntities = this.applyEntityFilters(indexedEntities, filters)
// Extract entities and embeddings
this.entities = []
this.embeddings = []
this.entityMetadata = []
for (const indexedEntity of filteredEntities) {
this.entities.push(indexedEntity.entity)
this.embeddings.push(indexedEntity.embedding)
this.entityMetadata.push({
uri: indexedEntity.uri,
content: indexedEntity.content,
type: indexedEntity.type,
fromVectorIndex: true
})
}
this.stats.totalEntities = this.entities.length
this.stats.lastDataLoadDate = new Date()
logger.info(`Loaded ${this.entities.length} entities from VectorIndex`)
return {
entitiesLoaded: this.entities.length,
entitiesSkipped: indexedEntities.length - filteredEntities.length,
loadTime: 0
}
} catch (error) {
logger.error('Error loading from VectorIndex:', error)
throw error
}
}
/**
* Train the VSOM on loaded data
* @param {Object} [options] - Training options
* @returns {Promise<Object>} Training results
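* @example
* // Sketch: all callbacks are optional; the onIteration signature shown
* // here is an assumption about what VSOMTraining passes through
* // await vsom.train({
* //   onIteration: (iteration, state) => logger.debug(`iteration ${iteration}`),
* //   shouldStop: () => false
* // })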
*/
async train(options = {}) {
if (this.embeddings.length === 0) {
throw new Error('No data loaded. Call loadFromEntities, loadFromSPARQL, or loadFromVectorIndex first.')
}
logger.info(`Training VSOM on ${this.embeddings.length} entities`)
// Initialize core algorithm
this.core.initializeWeights(
this.options.mapSize,
this.options.embeddingDimension,
options.initMethod || 'random'
)
// Execute training
this.trainingResults = await this.training.train(
this.core,
this.topology,
this.embeddings,
{
onIteration: options.onIteration,
onComplete: options.onComplete,
shouldStop: options.shouldStop
}
)
this.trained = true
this.stats.trainingTime = this.trainingResults.trainingTime
this.stats.lastTrainingDate = new Date()
// Generate node assignments
this.generateNodeAssignments()
logger.info(`VSOM training completed: ${this.trainingResults.totalIterations} iterations, ${this.trainingResults.trainingTime}ms`)
return this.trainingResults
}
/**
* Generate clusters of map nodes from the trained weights
* @param {number} [threshold] - Clustering threshold (defaults to options.clusterThreshold)
* @returns {Array} Array of cluster objects ({ id, members, centroid })
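* @example
* // const clusters = vsom.getClusters(0.75)
* // clusters.forEach(c => logger.info(`cluster ${c.id}: ${c.members.length} nodes`))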
*/
getClusters(threshold = null) {
if (!this.trained) {
throw new Error('VSOM must be trained before clustering. Call train() first.')
}
const clusterThreshold = threshold || this.options.clusterThreshold
logger.info(`Generating clusters with threshold ${clusterThreshold}`)
// Use weight similarity for clustering
this.clusters = this.generateClusters(clusterThreshold)
this.stats.totalClusters = this.clusters.length
return this.clusters
}
/**
* Get node mappings (entity to map position)
* @returns {Array} Array of node mappings
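* @example
* // for (const m of vsom.getNodeMappings()) {
* //   logger.info(`${m.metadata.uri} -> [${m.mapPosition}] (distance ${m.distance})`)
* // }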
*/
getNodeMappings() {
if (!this.nodeAssignments) {
throw new Error('Node assignments not generated. Train the VSOM first.')
}
return this.nodeAssignments.map((assignment, index) => ({
entityIndex: index,
entity: this.entities[index],
mapPosition: this.topology.indexToCoordinates(assignment.nodeIndex),
nodeIndex: assignment.nodeIndex,
distance: assignment.distance,
metadata: this.entityMetadata[index]
}))
}
/**
* Get topology information
* @returns {Object} Topology information
*/
getTopology() {
return this.topology.getTopologyInfo()
}
/**
* Export results to RDF dataset
* @param {Object} dataset - RDF dataset to augment
* @param {Object} [options] - Export options
* @returns {number} Number of triples added
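* @example
* // Sketch using rdf-ext (imported at the top of this module):
* // const dataset = rdf.dataset()
* // const triplesAdded = vsom.exportToRDF(dataset)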
*/
exportToRDF(dataset, options = {}) {
if (!this.trained) {
throw new Error('VSOM must be trained before RDF export')
}
logger.info('Exporting VSOM results to RDF')
let triplesAdded = 0
const clusters = this.clusters || this.getClusters()
const nodeMappings = this.getNodeMappings()
// Export cluster information
for (let clusterIndex = 0; clusterIndex < clusters.length; clusterIndex++) {
const cluster = clusters[clusterIndex]
const clusterUri = this.namespaces.ex(`cluster_${clusterIndex}`)
// Cluster type
dataset.add(rdf.quad(
clusterUri,
this.namespaces.rdf('type'),
this.namespaces.ragno('Cluster')
))
// Cluster properties
dataset.add(rdf.quad(
clusterUri,
this.namespaces.ragno('memberCount'),
rdf.literal(cluster.members.length.toString(), this.namespaces.xsd('integer'))
))
if (cluster.centroid) {
dataset.add(rdf.quad(
clusterUri,
this.namespaces.ragno('clusterCentroid'),
rdf.literal(cluster.centroid.join(','), this.namespaces.ragno('Vector'))
))
triplesAdded++
}
triplesAdded += 2
}
// Export entity mappings
for (const mapping of nodeMappings) {
const entityUri = rdf.namedNode(mapping.metadata.uri || mapping.entity.uri)
// Map position
dataset.add(rdf.quad(
entityUri,
this.namespaces.ragno('mapPosition'),
rdf.literal(`${mapping.mapPosition[0]},${mapping.mapPosition[1]}`, this.namespaces.xsd('string'))
))
triplesAdded++
// Find cluster assignment
const clusterIndex = this.findEntityCluster(mapping.entityIndex, clusters)
if (clusterIndex !== -1) {
const clusterUri = this.namespaces.ex(`cluster_${clusterIndex}`)
dataset.add(rdf.quad(
entityUri,
this.namespaces.ragno('cluster'),
clusterUri
))
// Cluster confidence based on distance to BMU
const confidence = Math.max(0, 1 - mapping.distance)
dataset.add(rdf.quad(
entityUri,
this.namespaces.ragno('clusterConfidence'),
rdf.literal(confidence.toFixed(3), this.namespaces.xsd('decimal'))
))
triplesAdded += 2
}
}
logger.info(`Exported ${triplesAdded} RDF triples`)
return triplesAdded
}
/**
* Export visualization coordinates
* @param {string} [format] - Output format ('coordinates', 'json', 'csv')
* @returns {Object|string} Visualization data
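* @example
* // const records = vsom.exportVisualization()      // array of node records
* // const csv = vsom.exportVisualization('csv')     // CSV string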
*/
exportVisualization(format = 'coordinates') {
const visualCoords = this.topology.getVisualizationCoordinates('cartesian')
const nodeMappings = this.getNodeMappings()
const visualizationData = visualCoords.map(coord => {
// Find the first entity assigned to this node (several entities may share a node)
const assignedEntity = nodeMappings.find(mapping => mapping.nodeIndex === coord.index)
return {
nodeIndex: coord.index,
mapCoords: coord.mapCoords,
visualCoords: coord.visualCoords,
entity: assignedEntity ? {
uri: assignedEntity.metadata.uri,
content: assignedEntity.metadata.content,
type: assignedEntity.metadata.type
} : null,
weights: this.core.getNodeWeights(coord.index)
}
})
switch (format) {
case 'json':
return JSON.stringify(visualizationData, null, 2)
case 'csv':
return this.convertToCSV(visualizationData)
case 'coordinates':
default:
return visualizationData
}
}
/**
* Integrate with Hyde algorithm results
* @param {Object} hydeResults - Results from Hyde algorithm
* @returns {Object} Integration results
*/
async integrateWithHyde(hydeResults) {
logger.info('Integrating VSOM with Hyde results')
// Separate hypothetical entities from factual ones
const hypotheticalEntities = hydeResults.entities.filter(entity =>
entity.metadata && entity.metadata.hypothetical
)
// Create separate clusters for hypothetical content
const hypotheticalClusters = await this.clusterHypotheticalEntities(hypotheticalEntities)
return {
hypotheticalClusters: hypotheticalClusters,
totalHypotheticalEntities: hypotheticalEntities.length,
confidenceDistribution: this.analyzeConfidenceDistribution(hypotheticalEntities)
}
}
/**
* Integrate with GraphAnalytics results
* @param {Object} graphResults - Results from GraphAnalytics
* @returns {Object} Integration results
*/
integrateWithGraphAnalytics(graphResults) {
logger.info('Integrating VSOM with GraphAnalytics results')
// Use centrality measures to weight entity importance in clustering
const enhancedClusters = this.enhanceClustersWithCentrality(graphResults)
return {
enhancedClusters: enhancedClusters,
centralityWeighting: true
}
}
// Helper methods
/**
* Extract entity data from various entity formats
* @param {Object|string} entity - Entity object or raw text content
* @returns {Object} Extracted entity data
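* @example
* // The three accepted shapes (values illustrative):
* // extractEntityData(ragnoEntity)                           // object exposing getPrefLabel()
* // extractEntityData({ uri: 'http://ex/e1', content: 'x' }) // plain object
* // extractEntityData('free-text content')                   // bare string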
*/
extractEntityData(entity) {
// Handle different entity formats
if (entity.getPrefLabel && typeof entity.getPrefLabel === 'function') {
// Ragno Entity object
return {
uri: entity.uri,
content: entity.getPrefLabel() || entity.content || '',
type: entity.getSubType() || 'entity',
metadata: entity.metadata || {}
}
} else if (entity.uri && entity.content) {
// Plain object with uri and content
return {
uri: entity.uri,
content: entity.content,
type: entity.type || 'entity',
metadata: entity.metadata || {}
}
} else if (typeof entity === 'string') {
// String content
const uri = this.namespaces.ex(`entity_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`)
return {
uri: uri.value,
content: entity,
type: 'text',
metadata: {}
}
} else {
throw new Error(`Unsupported entity format: ${typeof entity}`)
}
}
/**
* Execute SPARQL query (placeholder implementation)
* @param {string} endpoint - SPARQL endpoint URL
* @param {string} query - SPARQL query
* @param {Object} options - Query options
* @returns {Promise<Array>} Query results
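* @example
* // One possible implementation via the SPARQL 1.1 Protocol (a sketch, not
* // wired in; assumes the endpoint returns SPARQL JSON results):
* // const response = await fetch(endpoint, {
* //   method: 'POST',
* //   headers: {
* //     'Content-Type': 'application/sparql-query',
* //     'Accept': 'application/sparql-results+json'
* //   },
* //   body: query
* // })
* // const json = await response.json()
* // return json.results.bindings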
*/
async executeSPARQLQuery(endpoint, query, options) {
// This would integrate with the existing SPARQL infrastructure
// For now, return empty results
logger.warn('SPARQL query execution not implemented yet')
return []
}
/**
* Process SPARQL results into entity format
* @param {Array} sparqlResults - SPARQL query results
* @returns {Array} Processed entities
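* @example
* // Expected binding shape (SPARQL JSON results, values illustrative):
* // { entity: { value: 'http://ex/e1' }, label: { value: 'Example' }, type: { value: 'concept' } }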
*/
processSPARQLResults(sparqlResults) {
return sparqlResults.map(result => ({
uri: result.entity?.value || '',
content: result.label?.value || result.content?.value || '',
type: result.type?.value || 'entity',
metadata: {
fromSPARQL: true,
sparqlResult: result
}
}))
}
/**
* Apply filters to entity data
* @param {Array} entities - Array of entities
* @param {Object} filters - Filter criteria
* @returns {Array} Filtered entities
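* @example
* // applyEntityFilters(entities, { type: 'concept' }) // keeps entities where entity.type === 'concept'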
*/
applyEntityFilters(entities, filters) {
return entities.filter(entity => {
for (const [key, value] of Object.entries(filters)) {
if (entity[key] !== value) {
return false
}
}
return true
})
}
/**
* Generate node assignments for entities
*/
generateNodeAssignments() {
this.nodeAssignments = this.embeddings.map(embedding => {
const bmuIndex = this.core.findSingleBMU(embedding)
const distance = this.core.calculateDistance(embedding, this.core.getNodeWeights(bmuIndex))
return {
nodeIndex: bmuIndex,
distance: distance
}
})
}
/**
* Generate clusters from trained map
* @param {number} threshold - Clustering threshold
* @returns {Array} Array of clusters
*/
generateClusters(threshold) {
// Region-growing clustering: expand each unvisited node into a cluster
// of topological neighbors whose weight similarity exceeds the threshold
const clusters = []
const visited = new Set()
for (let i = 0; i < this.core.totalNodes; i++) {
if (visited.has(i)) continue
const cluster = this.expandCluster(i, threshold, visited)
if (cluster.members.length >= this.options.minClusterSize) {
clusters.push(cluster)
}
}
return clusters
}
/**
* Expand cluster using neighboring nodes
* @param {number} seedIndex - Starting node index
* @param {number} threshold - Similarity threshold
* @param {Set} visited - Set of visited nodes
* @returns {Object} Cluster object
*/
expandCluster(seedIndex, threshold, visited) {
const cluster = {
id: seedIndex,
members: [seedIndex],
centroid: [...this.core.getNodeWeights(seedIndex)]
}
visited.add(seedIndex)
const queue = [seedIndex]
while (queue.length > 0) {
const currentIndex = queue.shift()
const currentCoords = this.topology.indexToCoordinates(currentIndex)
// Check neighboring nodes (radius 1.5 also captures diagonal neighbors on a rectangular grid)
const neighbors = this.topology.getNeighbors(currentCoords, 1.5)
for (const neighbor of neighbors) {
const neighborIndex = this.topology.coordinatesToIndex(...neighbor.coords)
if (!visited.has(neighborIndex)) {
const similarity = this.calculateNodeSimilarity(currentIndex, neighborIndex)
if (similarity > threshold) {
cluster.members.push(neighborIndex)
visited.add(neighborIndex)
queue.push(neighborIndex)
}
}
}
}
// Recalculate centroid
if (cluster.members.length > 1) {
cluster.centroid = this.calculateClusterCentroid(cluster.members)
}
return cluster
}
/**
* Calculate similarity between two nodes
* @param {number} index1 - First node index
* @param {number} index2 - Second node index
* @returns {number} Similarity score
*/
calculateNodeSimilarity(index1, index2) {
const weights1 = this.core.getNodeWeights(index1)
const weights2 = this.core.getNodeWeights(index2)
const distance = this.core.calculateDistance(weights1, weights2)
// Convert distance to similarity, clamped to [0, 1] (distances >= 1 map to 0)
return Math.max(0, 1 - distance)
}
/**
* Calculate cluster centroid
* @param {Array} memberIndices - Array of member node indices
* @returns {Array} Centroid vector
*/
calculateClusterCentroid(memberIndices) {
const centroid = new Array(this.options.embeddingDimension).fill(0)
for (const index of memberIndices) {
const weights = this.core.getNodeWeights(index)
for (let i = 0; i < weights.length; i++) {
centroid[i] += weights[i]
}
}
for (let i = 0; i < centroid.length; i++) {
centroid[i] /= memberIndices.length
}
return centroid
}
/**
* Find which cluster an entity belongs to
* @param {number} entityIndex - Entity index
* @param {Array} clusters - Array of clusters
* @returns {number} Cluster index or -1 if not found
*/
findEntityCluster(entityIndex, clusters) {
if (!this.nodeAssignments || !this.nodeAssignments[entityIndex]) {
return -1
}
const nodeIndex = this.nodeAssignments[entityIndex].nodeIndex
for (let i = 0; i < clusters.length; i++) {
if (clusters[i].members.includes(nodeIndex)) {
return i
}
}
return -1
}
/**
* Cluster hypothetical entities separately
* @param {Array} hypotheticalEntities - Array of hypothetical entities
* @returns {Promise<Array>} Hypothetical clusters
*/
async clusterHypotheticalEntities(hypotheticalEntities) {
// Placeholder implementation
logger.info(`Clustering ${hypotheticalEntities.length} hypothetical entities`)
return []
}
/**
* Analyze confidence distribution
* @param {Array} entities - Array of entities with confidence scores
* @returns {Object} Confidence analysis
*/
analyzeConfidenceDistribution(entities) {
const confidences = entities
.map(entity => entity.metadata?.confidence || 0)
.filter(conf => conf > 0)
if (confidences.length === 0) {
return { mean: 0, std: 0, min: 0, max: 0 }
}
const mean = confidences.reduce((sum, conf) => sum + conf, 0) / confidences.length
const variance = confidences.reduce((sum, conf) => sum + Math.pow(conf - mean, 2), 0) / confidences.length
return {
mean: mean,
std: Math.sqrt(variance),
min: Math.min(...confidences),
max: Math.max(...confidences),
count: confidences.length
}
}
/**
* Enhance clusters with centrality measures
* @param {Object} graphResults - Graph analytics results
* @returns {Array} Enhanced clusters
*/
enhanceClustersWithCentrality(graphResults) {
// Placeholder implementation
logger.info('Enhancing clusters with centrality measures')
return this.clusters || []
}
/**
* Convert data to CSV format
* @param {Array} data - Data to convert
* @returns {string} CSV string
*/
convertToCSV(data) {
if (data.length === 0) return ''
const headers = Object.keys(data[0])
const csvHeaders = headers.join(',')
// Quote fields containing commas, quotes, or newlines (RFC 4180)
const escapeField = (value) => {
const str = typeof value === 'object' ? JSON.stringify(value) : String(value)
return /[",\n]/.test(str) ? `"${str.replace(/"/g, '""')}"` : str
}
const csvRows = data.map(row =>
headers.map(header => escapeField(row[header])).join(',')
)
return [csvHeaders, ...csvRows].join('\n')
}
/**
* Get algorithm statistics
* @returns {Object} VSOM statistics
*/
getStatistics() {
return {
...this.stats,
trained: this.trained,
mapSize: this.options.mapSize,
totalNodes: this.topology.totalNodes,
embeddingDimension: this.options.embeddingDimension,
core: this.core.getStatistics(),
topology: this.topology.getTopologyInfo(),
training: this.training.getStatistics(),
memoryUsage: this.estimateMemoryUsage()
}
}
/**
* Estimate total memory usage
* @returns {number} Estimated memory usage in bytes
*/
estimateMemoryUsage() {
const coreMemory = this.core.estimateMemoryUsage()
const topologyMemory = this.topology.estimateMemoryUsage()
const trainingMemory = this.training.estimateMemoryUsage()
const dataMemory = this.embeddings.length * this.options.embeddingDimension * 8 // Float64
return coreMemory + topologyMemory + trainingMemory + dataMemory
}
/**
* Reset VSOM state
*/
reset() {
this.entities = []
this.embeddings = []
this.entityMetadata = []
this.trained = false
this.clusters = null
this.nodeAssignments = null
this.trainingResults = null
this.training.reset()
this.stats = {
totalEntities: 0,
totalClusters: 0,
trainingTime: 0,
lastTrainingDate: null,
dataLoadTime: 0,
lastDataLoadDate: null
}
logger.debug('VSOM state reset')
}
}