/**
* SemanticUnit.js - RDF-based Semantic Unit implementation for Ragno
*
* This class represents ragno:Unit as an RDF resource following the ragno ontology
* specification. Semantic units are independent textual segments that represent
* coherent events, facts, or concepts extracted from larger documents.
*
* Key Features:
* - First-class RDF resource following ragno:Unit
* - SKOS Concept compliance for semantic interoperability
* - Source document tracking and provenance
* - Connection management with entities and other units
* - Summary and full text content management
* - Vector embedding integration for similarity search
*/
import rdf from 'rdf-ext'
import RDFElement from './models/RDFElement.js'
import { logger } from '../Utils.js'
export default class SemanticUnit extends RDFElement {
constructor(options = {}) {
// Initialize with unit type
super({
...options,
type: 'unit'
})
// Add ragno:Unit type
this.addType(this.ns.classes.Unit)
// Set text content if provided
if (options.text || options.content) {
const content = options.text || options.content
this.setContent(content)
}
// Set summary if provided (as SKOS definition)
if (options.summary) {
this.setSummary(options.summary)
}
// Set source document if provided
if (options.source || options.sourceDocument) {
this.setSourceDocument(options.source || options.sourceDocument)
}
// Set sub-type if provided (e.g., "Fact", "Event", "Description")
if (options.subType) {
this.setSubType(options.subType)
}
// Units are typically not entry points (entities are)
this.setEntryPoint(options.isEntryPoint !== undefined ? options.isEntryPoint : false)
// Set position in source if provided
if (options.position !== undefined) {
this.setPosition(options.position)
}
// Set length if provided
if (options.length !== undefined) {
this.setLength(options.length)
}
// Set custom metadata if provided
if (options.metadata && typeof options.metadata === 'object') {
this.setAllMetadata(options.metadata)
}
logger.debug(`Created ragno:Unit: ${this.uri}`)
}
/**
* Set the main text content for this unit
* @param {string} text - Text content
*/
setText(text) {
this.setContent(text)
}
/**
* Get the main text content of this unit
* @returns {string|null} Text content
*/
getText() {
return this.getContent()
}
/**
* Set summary for this unit (stored as SKOS definition)
* @param {string} summary - Summary text
* @param {string} [lang='en'] - Language tag
*/
setSummary(summary, lang = 'en') {
this.removeTriple(this.ns.skosProperties.definition)
this.addTriple(this.ns.skosProperties.definition, rdf.literal(summary, lang))
}
/**
* Get summary for this unit
* @returns {string|null} Summary text
*/
getSummary() {
const quads = this.getTriplesWithPredicate(this.ns.skosProperties.definition)
return quads.length > 0 ? quads[0].object.value : null
}
/**
* Set source document for this unit
* @param {string|NamedNode} source - Source document URI or node
*/
setSourceDocument(source) {
this.removeTriple(this.ns.properties.hasSourceDocument)
const sourceNode = typeof source === 'string' ? rdf.namedNode(source) : source
this.addTriple(this.ns.properties.hasSourceDocument, sourceNode)
}
/**
* Get source document for this unit
* @returns {NamedNode|null} Source document node
*/
getSourceDocument() {
const quads = this.getTriplesWithPredicate(this.ns.properties.hasSourceDocument)
return quads.length > 0 ? quads[0].object : null
}
/**
* Set position in source document
* @param {number} position - Character position
*/
setPosition(position) {
this.removeTriple(this.ns.ex('position'))
this.addTriple(this.ns.ex('position'), rdf.literal(position))
}
/**
* Get position in source document
* @returns {number|null} Character position
*/
getPosition() {
const quads = this.getTriplesWithPredicate(this.ns.ex('position'))
return quads.length > 0 ? parseInt(quads[0].object.value) : null
}
/**
* Set length of this unit in characters
* @param {number} length - Character length
*/
setLength(length) {
this.removeTriple(this.ns.ex('length'))
this.addTriple(this.ns.ex('length'), rdf.literal(length))
}
/**
* Get length of this unit in characters
* @returns {number|null} Character length
*/
getLength() {
const quads = this.getTriplesWithPredicate(this.ns.ex('length'))
return quads.length > 0 ? parseInt(quads[0].object.value) : null
}
/**
* Set vector embedding for this unit
* @param {Array<number>} embedding - Vector embedding
*/
setEmbedding(embedding) {
this.removeTriple(this.ns.ex('embedding'))
// Store embedding as JSON string for now
const embeddingStr = JSON.stringify(embedding)
this.addTriple(this.ns.ex('embedding'), rdf.literal(embeddingStr))
}
/**
* Get vector embedding for this unit
* @returns {Array<number>|null} Vector embedding
*/
getEmbedding() {
const quads = this.getTriplesWithPredicate(this.ns.ex('embedding'))
if (quads.length > 0) {
try {
return JSON.parse(quads[0].object.value)
} catch (error) {
logger.warn(`Failed to parse embedding for unit ${this.uri}:`, error)
return null
}
}
return null
}
/**
* Add a connection to an entity that this unit mentions
* @param {Entity|NamedNode|string} entity - Entity reference
* @param {number} [relevanceScore] - Relevance score (0-1)
*/
addEntityConnection(entity, relevanceScore) {
const entityNode = this._normalizeEntityReference(entity)
this.connectTo(entityNode, relevanceScore)
// Also add specific property for entity connections
this.addTriple(this.ns.ex('mentionsEntity'), entityNode)
if (relevanceScore !== undefined) {
// Create reified statement for relevance score
const connection = rdf.namedNode(`${this.uri}/entityConnection/${Date.now()}`)
this.dataset.add(rdf.quad(connection, this.ns.rdf.subject, this.node))
this.dataset.add(rdf.quad(connection, this.ns.rdf.predicate, this.ns.ex('mentionsEntity')))
this.dataset.add(rdf.quad(connection, this.ns.rdf.object, entityNode))
this.dataset.add(rdf.quad(connection, this.ns.ex('relevanceScore'), rdf.literal(relevanceScore)))
}
}
/**
* Get all entities mentioned by this unit
* @returns {Array<NamedNode>} Entity nodes
*/
getMentionedEntities() {
return this.getTriplesWithPredicate(this.ns.ex('mentionsEntity'))
.map(quad => quad.object)
}
/**
* Add a connection to another semantic unit
* @param {SemanticUnit|NamedNode|string} unit - Unit reference
* @param {string} [relationType] - Type of relationship
* @param {number} [weight] - Connection weight
*/
addUnitConnection(unit, relationType, weight) {
const unitNode = this._normalizeUnitReference(unit)
this.connectTo(unitNode, weight)
if (relationType) {
// Add typed connection
const connection = rdf.namedNode(`${this.uri}/unitConnection/${Date.now()}`)
this.dataset.add(rdf.quad(connection, this.ns.rdf.subject, this.node))
this.dataset.add(rdf.quad(connection, this.ns.rdf.predicate, this.ns.properties.connectsTo))
this.dataset.add(rdf.quad(connection, this.ns.rdf.object, unitNode))
this.dataset.add(rdf.quad(connection, this.ns.ex('relationType'), rdf.literal(relationType)))
if (weight !== undefined) {
this.dataset.add(rdf.quad(connection, this.ns.properties.hasWeight, rdf.literal(weight)))
}
}
}
/**
* Get all connected semantic units
* @returns {Array<Object>} Connected units with relationship info
*/
getConnectedUnits() {
const connections = this.getConnectedElements()
// Filter for units and get relationship info
const unitConnections = []
for (const connection of connections) {
// Check if this is a unit (would need broader dataset query in practice)
unitConnections.push({
unit: connection,
type: 'connectsTo'
})
}
return unitConnections
}
/**
* Set corpus association for this unit
* @param {string|NamedNode} corpus - Corpus URI or node
*/
setCorpus(corpus) {
this.removeTriple(this.ns.properties.inCorpus)
const corpusNode = typeof corpus === 'string' ? rdf.namedNode(corpus) : corpus
this.addTriple(this.ns.properties.inCorpus, corpusNode)
}
/**
* Get corpus association for this unit
* @returns {NamedNode|null} Corpus node
*/
getCorpus() {
const quads = this.getTriplesWithPredicate(this.ns.properties.inCorpus)
return quads.length > 0 ? quads[0].object : null
}
/**
* Set language for this unit
* @param {string} language - Language code (e.g., 'en', 'es')
*/
setLanguage(language) {
this.removeTriple(this.ns.dcProperties.language)
this.addTriple(this.ns.dcProperties.language, rdf.literal(language))
}
/**
* Get language for this unit
* @returns {string|null} Language code
*/
getLanguage() {
const quads = this.getTriplesWithPredicate(this.ns.dcProperties.language)
return quads.length > 0 ? quads[0].object.value : null
}
/**
* Add an entity mention to this semantic unit as an RDF triple
* @param {string} entityURI - The URI of the mentioned entity
* @param {number} [relevance=1.0] - Optional relevance/confidence score
*/
addEntityMention(entityURI, relevance = 1.0) {
// Add ragno:mention triple
const mentionPredicate = this.ns.properties.mention || this.ns.ex('mention') || rdf.namedNode('http://hyperdata.it/ontologies/ragno#mention');
this.addTriple(mentionPredicate, rdf.namedNode(entityURI));
// Optionally, add a relevance/confidence triple (custom property)
if (relevance !== undefined && !isNaN(relevance)) {
const relPredicate = this.ns.properties.mentionRelevance || this.ns.ex('mentionRelevance') || rdf.namedNode('http://hyperdata.it/ontologies/ragno#mentionRelevance');
this.addTriple(relPredicate, rdf.literal(relevance.toString(), this.ns.xsd.double));
}
}
/**
* Normalize different entity reference formats to NamedNode
* @private
* @param {Entity|NamedNode|string} entity - Entity reference
* @returns {NamedNode} Normalized entity node
*/
_normalizeEntityReference(entity) {
if (typeof entity === 'string') {
return rdf.namedNode(entity)
} else if (entity && typeof entity === 'object' && entity.node) {
// Entity instance
return entity.node
} else if (entity && entity.termType === 'NamedNode') {
// Already a NamedNode
return entity
} else {
throw new Error(`Invalid entity reference: ${entity}`)
}
}
/**
* Normalize different unit reference formats to NamedNode
* @private
* @param {SemanticUnit|NamedNode|string} unit - Unit reference
* @returns {NamedNode} Normalized unit node
*/
_normalizeUnitReference(unit) {
if (typeof unit === 'string') {
return rdf.namedNode(unit)
} else if (unit && typeof unit === 'object' && unit.node) {
// SemanticUnit instance
return unit.node
} else if (unit && unit.termType === 'NamedNode') {
// Already a NamedNode
return unit
} else {
throw new Error(`Invalid unit reference: ${unit}`)
}
}
/**
* Validate this unit according to ragno ontology
* @returns {Object} Validation result
*/
validate() {
const baseValidation = super.validate()
const errors = [...baseValidation.errors]
// Check ragno:Unit type
if (!this.hasType(this.ns.classes.Unit)) {
errors.push('Unit must have ragno:Unit type')
}
// Check required content
if (!this.getText()) {
errors.push('Unit must have text content')
}
// Check text length is reasonable
const text = this.getText()
if (text && text.length < 10) {
errors.push('Unit text content should be at least 10 characters')
}
return {
valid: errors.length === 0,
errors
}
}
/**
* Get unit metadata including ragno-specific properties
* @returns {Object} Unit metadata
*/
getMetadata() {
const baseMetadata = super.getMetadata()
return {
...baseMetadata,
text: this.getText(),
summary: this.getSummary(),
sourceDocument: this.getSourceDocument()?.value,
position: this.getPosition(),
length: this.getLength(),
language: this.getLanguage(),
corpus: this.getCorpus()?.value,
mentionedEntitiesCount: this.getMentionedEntities().length,
connectedUnitsCount: this.getConnectedUnits().length,
hasEmbedding: this.getEmbedding() !== null
}
}
/**
* Convert to simple object representation (for backward compatibility)
* @returns {Object} Simple object representation
*/
toSimpleObject() {
return {
uri: this.uri,
text: this.getText(),
summary: this.getSummary(),
source: this.getSourceDocument()?.value,
sourceDocument: this.getSourceDocument()?.value,
position: this.getPosition(),
length: this.getLength(),
language: this.getLanguage(),
subType: this.getSubType(),
corpus: this.getCorpus()?.value,
isEntryPoint: this.isEntryPoint()
}
}
/**
* Create unit from simple object (migration helper)
* @param {Object} obj - Simple object representation
* @param {Object} [options] - Additional options
* @returns {SemanticUnit} RDF-based unit
*/
static fromSimpleObject(obj, options = {}) {
return new SemanticUnit({
...options,
text: obj.text,
summary: obj.summary,
source: obj.source || obj.sourceDocument,
position: obj.position,
length: obj.length,
language: obj.language,
subType: obj.subType,
corpus: obj.corpus,
isEntryPoint: obj.isEntryPoint
})
}
/**
* Create a semantic unit with automatic URI generation
* @param {string} text - Unit text content
* @param {Object} [options] - Additional options
* @returns {SemanticUnit} Created unit
*/
static create(text, options = {}) {
return new SemanticUnit({
...options,
text
})
}
/**
* Clone this unit with optional modifications
* @param {Object} [modifications] - Properties to modify in the clone
* @returns {SemanticUnit} Cloned unit
*/
clone(modifications = {}) {
const cloned = new SemanticUnit({
dataset: rdf.dataset(), // New dataset for clone
text: modifications.text || this.getText(),
summary: modifications.summary || this.getSummary(),
source: modifications.source || this.getSourceDocument(),
position: modifications.position !== undefined ? modifications.position : this.getPosition(),
length: modifications.length !== undefined ? modifications.length : this.getLength(),
language: modifications.language || this.getLanguage(),
subType: modifications.subType || this.getSubType(),
corpus: modifications.corpus || this.getCorpus(),
isEntryPoint: modifications.isEntryPoint !== undefined ? modifications.isEntryPoint : this.isEntryPoint()
})
// Copy additional properties that aren't handled by constructor
for (const quad of this.getTriples()) {
// Skip properties that are handled by constructor
if (!quad.predicate.equals(this.ns.properties.content) &&
!quad.predicate.equals(this.ns.skosProperties.definition) &&
!quad.predicate.equals(this.ns.properties.hasSourceDocument) &&
!quad.predicate.equals(this.ns.ex('position')) &&
!quad.predicate.equals(this.ns.ex('length')) &&
!quad.predicate.equals(this.ns.dcProperties.language) &&
!quad.predicate.equals(this.ns.properties.subType) &&
!quad.predicate.equals(this.ns.properties.inCorpus) &&
!quad.predicate.equals(this.ns.properties.isEntryPoint) &&
!quad.predicate.equals(this.ns.dcProperties.created)) {
cloned.addTriple(quad.predicate, quad.object)
}
}
return cloned
}
}