/**
* Ragno: Community Detection and Aggregation - RDF-Ext Version
*
* This module uses advanced Leiden clustering to detect communities in the knowledge
* graph and generates comprehensive community summaries as CommunityElement RDF resources.
* It integrates with the ragno search system for community-based retrieval.
*/
import rdf from 'rdf-ext'
import Attribute from './Attribute.js'
import RDFGraphManager from './core/RDFGraphManager.js'
import NamespaceManager from './core/NamespaceManager.js'
import { CommunityDetection } from './algorithms/index.js'
import { logger } from '../Utils.js'
/**
* Community Element class representing ragno:CommunityElement
*/
/**
 * Community Element class representing ragno:CommunityElement
 *
 * Wraps one detected community as an RDF resource (typed as both
 * ragno:CommunityElement and skos:Concept) carrying its member entities,
 * generated summary, quality scores, keywords and provenance.
 */
class CommunityElement {
    /**
     * @param {RDFGraphManager} rdfManager - Manager used to mint URIs, literals and quads
     * @param {Object} [options] - Community properties
     * @param {string} [options.id] - Stable identifier; a unique one is generated when absent
     * @param {Array} [options.members] - Member entity URI terms
     * @param {string} [options.summary] - Community summary text
     * @param {number} [options.confidence=0.5] - Summary confidence in [0, 1]
     * @param {number} [options.modularityScore=0] - Leiden modularity score for this community
     * @param {number} [options.cohesionScore=0] - Internal edge density in [0, 1]
     * @param {Array<string>} [options.keywords] - Searchable keywords
     * @param {string} [options.provenance] - Human-readable origin note
     */
    constructor(rdfManager, options = {}) {
        this.rdfManager = rdfManager
        this.ns = rdfManager.getNamespaceManager()
        // slice(2, 11) replaces the deprecated substr(2, 9) — same 9 characters
        this.id = options.id || `community_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`
        this.uri = this.ns.createURI('ragno', this.id)
        // Core properties — ?? (not ||) so an explicit 0 score/confidence is preserved
        this.members = options.members || []
        this.summary = options.summary || ''
        this.confidence = options.confidence ?? 0.5
        this.modularityScore = options.modularityScore ?? 0.0
        this.cohesionScore = options.cohesionScore ?? 0.0
        this.keywords = options.keywords || []
        this.provenance = options.provenance || 'Leiden community detection'
        // Materialize this community as RDF quads
        this._initializeRDF()
    }

    /**
     * Build the internal rdf-ext dataset describing this community:
     * type declarations, scalar properties, member links, keywords,
     * provenance and a creation timestamp.
     */
    _initializeRDF() {
        const quad = this.rdfManager.createQuad
        this.dataset = rdf.dataset()
        // Type declaration
        this.dataset.add(quad(
            this.uri,
            this.ns.rdf.type,
            this.ns.ragno('CommunityElement')
        ))
        // Add as SKOS Concept
        this.dataset.add(quad(
            this.uri,
            this.ns.rdf.type,
            this.ns.skos.Concept
        ))
        // Core properties
        this.dataset.add(quad(
            this.uri,
            this.ns.ragno('content'),
            this.rdfManager.createLiteral(this.summary)
        ))
        this.dataset.add(quad(
            this.uri,
            this.ns.ragno('hasConfidence'),
            this.rdfManager.createLiteral(this.confidence, this.ns.xsd.float)
        ))
        this.dataset.add(quad(
            this.uri,
            this.ns.ragno('modularityScore'),
            this.rdfManager.createLiteral(this.modularityScore, this.ns.xsd.float)
        ))
        this.dataset.add(quad(
            this.uri,
            this.ns.ragno('cohesionScore'),
            this.rdfManager.createLiteral(this.cohesionScore, this.ns.xsd.float)
        ))
        // Add member entities (members are expected to already be RDF terms)
        for (const memberUri of this.members) {
            this.dataset.add(quad(
                this.uri,
                this.ns.ragno('hasCommunityMember'),
                memberUri
            ))
        }
        // Add keywords
        for (const keyword of this.keywords) {
            this.dataset.add(quad(
                this.uri,
                this.ns.ragno('hasKeyword'),
                this.rdfManager.createLiteral(keyword)
            ))
        }
        // Provenance
        this.dataset.add(quad(
            this.uri,
            this.ns.ragno('provenance'),
            this.rdfManager.createLiteral(this.provenance)
        ))
        // Timestamp
        this.dataset.add(quad(
            this.uri,
            this.ns.ragno('timestamp'),
            this.rdfManager.createLiteral(new Date().toISOString(), this.ns.xsd.dateTime)
        ))
    }

    // Accessor methods
    getURI() { return this.uri }
    getMembers() { return this.members }
    getSummary() { return this.summary }
    getConfidence() { return this.confidence }
    getModularityScore() { return this.modularityScore }
    getCohesionScore() { return this.cohesionScore }
    getKeywords() { return this.keywords }

    /**
     * Copy every quad of this community into an external dataset.
     * @param {Dataset} targetDataset - rdf-ext dataset to receive the quads
     */
    exportToDataset(targetDataset) {
        for (const quad of this.dataset) {
            targetDataset.add(quad)
        }
    }

    /**
     * Create a searchable overview attribute for this community.
     * @param {RDFGraphManager} rdfManager - Manager used by the attribute
     * @returns {Attribute} Overview attribute linked to this community
     */
    createOverviewAttribute(rdfManager) {
        return Attribute.createOverviewAttribute(rdfManager, {
            entityURI: this.uri,
            summary: this.summary,
            confidence: this.confidence,
            keywords: this.keywords,
            provenance: `Community overview: ${this.provenance}`
        })
    }
}
/**
* Detect communities and generate comprehensive summaries using Leiden clustering
* @param {Object} graphData - Graph data with RDF dataset
* @param {Object} llmHandler - LLM handler instance
* @param {Object} [options] - Community detection options
* @returns {Promise<{communities: CommunityElement[], attributes: Attribute[], dataset: Dataset, statistics: Object}>}
*/
/**
 * Detect communities and generate comprehensive summaries using Leiden clustering
 * @param {Object} graphData - Graph data with RDF dataset (and optionally entities/units/relationships)
 * @param {Object} llmHandler - LLM handler instance used for summary generation
 * @param {Object} [options] - Community detection options (see defaults below)
 * @returns {Promise<{communities: CommunityElement[], attributes: Attribute[], dataset: Dataset, statistics: Object}>}
 * @throws Re-throws any error raised during detection or aggregation after logging it
 */
export async function aggregateCommunities(graphData, llmHandler, options = {}) {
    const startTime = Date.now()
    logger.info('Starting community detection and aggregation...')
    // Caller options are spread FIRST so the computed defaults below only win
    // when a key is absent; previously `...options` came last and overrode the
    // computed values (including the `!== false` boolean flags).
    // ?? (not ||) preserves explicit falsy values such as 0.
    const opts = {
        ...options,
        // Leiden algorithm parameters
        resolution: options.resolution ?? 1.0,
        minCommunitySize: options.minCommunitySize ?? 3,
        maxIterations: options.maxIterations ?? 100,
        randomSeed: options.randomSeed ?? 42,
        // Summary generation
        generateSummaries: options.generateSummaries !== false,
        maxSummaryLength: options.maxSummaryLength ?? 300,
        includeKeywords: options.includeKeywords !== false,
        // Quality control
        minModularityScore: options.minModularityScore ?? 0.1,
        minCohesionScore: options.minCohesionScore ?? 0.3
    }
    // Initialize RDF infrastructure
    const namespaceManager = new NamespaceManager()
    const rdfManager = new RDFGraphManager({ namespace: namespaceManager })
    const resultDataset = rdf.dataset()
    // Copy the existing dataset so the caller's graph is never mutated
    if (graphData.dataset) {
        for (const quad of graphData.dataset) {
            resultDataset.add(quad)
        }
    }
    try {
        // Phase 1: Run Leiden community detection
        const communityDetection = new CommunityDetection()
        const graph = await communityDetection.buildGraphFromRDF(graphData.dataset)
        if (graph.nodes.size < opts.minCommunitySize) {
            // Not enough nodes to form even one community — return empty result
            logger.warn('Graph too small for meaningful community detection')
            return {
                communities: [],
                attributes: [],
                dataset: resultDataset,
                statistics: {
                    processingTime: Date.now() - startTime,
                    communitiesDetected: 0,
                    nodesProcessed: graph.nodes.size
                }
            }
        }
        logger.info(`Running Leiden clustering on graph with ${graph.nodes.size} nodes and ${graph.edges.size} edges`)
        const clusteringResults = communityDetection.computeLeidenClustering(graph, {
            resolution: opts.resolution,
            maxIterations: opts.maxIterations,
            randomSeed: opts.randomSeed
        })
        logger.info(`Detected ${clusteringResults.communities.length} communities with modularity: ${clusteringResults.modularity.toFixed(3)}`)
        // Phase 2: Filter out communities below the size/modularity thresholds
        const validCommunities = clusteringResults.communities.filter(community =>
            community.members.length >= opts.minCommunitySize &&
            (clusteringResults.modularityScores?.get(community.id) || 0) >= opts.minModularityScore
        )
        logger.info(`${validCommunities.length} communities meet quality thresholds`)
        // Phase 3: Generate comprehensive summaries for each community
        const communityElements = []
        const attributes = []
        for (const community of validCommunities) {
            logger.debug(`Processing community ${community.id} with ${community.members.length} members`)
            // Gather community context (member entities, mentioning units, internal relationships)
            const communityContext = await gatherCommunityContext(
                community,
                graphData,
                opts
            )
            // Generate LLM summary (falls back to defaults when disabled or context is empty)
            let summary = ''
            let keywords = []
            let confidence = 0.5
            if (opts.generateSummaries && communityContext.contextText) {
                const summaryData = await generateCommunitySummary(
                    community,
                    communityContext,
                    llmHandler,
                    opts
                )
                if (summaryData) {
                    summary = summaryData.summary
                    keywords = summaryData.keywords
                    confidence = summaryData.confidence
                }
            }
            // Calculate community cohesion score (internal edge density)
            const cohesionScore = calculateCommunityCohesion(community, graph)
            // Create CommunityElement and export its quads into the result dataset
            const communityElement = new CommunityElement(rdfManager, {
                id: `community_${community.id}`,
                members: community.members,
                summary: summary,
                confidence: confidence,
                modularityScore: clusteringResults.modularityScores?.get(community.id) || 0,
                cohesionScore: cohesionScore,
                keywords: keywords,
                provenance: `Leiden clustering (resolution=${opts.resolution})`
            })
            communityElements.push(communityElement)
            communityElement.exportToDataset(resultDataset)
            // Create overview attribute for searchability
            if (summary) {
                const overviewAttribute = communityElement.createOverviewAttribute(rdfManager)
                attributes.push(overviewAttribute)
                overviewAttribute.exportToDataset(resultDataset)
            }
            logger.debug(`Community ${community.id}: ${summary.length} char summary, ${keywords.length} keywords, cohesion: ${cohesionScore.toFixed(3)}`)
        }
        // Phase 4: Create inter-community relationships (overlap / connectivity)
        await createInterCommunityRelationships(communityElements, resultDataset, rdfManager, graph)
        const processingTime = Date.now() - startTime
        logger.info(`Community aggregation completed in ${processingTime}ms: ${communityElements.length} communities, ${attributes.length} attributes`)
        return {
            communities: communityElements,
            attributes: attributes,
            dataset: resultDataset,
            statistics: {
                processingTime,
                communitiesDetected: validCommunities.length,
                totalCommunities: clusteringResults.communities.length,
                overallModularity: clusteringResults.modularity,
                // Guard the division: zero valid communities previously produced NaN
                averageCommunitySize: validCommunities.length > 0
                    ? validCommunities.reduce((sum, c) => sum + c.members.length, 0) / validCommunities.length
                    : 0,
                nodesProcessed: graph.nodes.size,
                edgesProcessed: graph.edges.size,
                attributesGenerated: attributes.length
            }
        }
    } catch (error) {
        logger.error('Community aggregation failed:', error)
        throw error
    }
}
/**
* Gather comprehensive context for a community
* @param {Object} community - Community object with members
* @param {Object} graphData - Graph data
* @param {Object} options - Context gathering options
* @returns {Promise<Object>} Community context object
*/
/**
 * Gather comprehensive context for a community
 * @param {Object} community - Community object with `id` and `members` (URI strings)
 * @param {Object} graphData - Graph data with optional entities/units/relationships arrays
 * @param {Object} options - Context gathering options (uses maxSummaryLength)
 * @returns {Promise<Object>} Community context: memberEntities, units, relationships, contextText, evidence
 */
async function gatherCommunityContext(community, graphData, options) {
    const context = {
        community: community,
        memberEntities: [],
        units: [],
        relationships: [],
        contextText: '',
        evidence: []
    }
    // O(1) membership checks instead of Array.includes inside every loop below
    const memberSet = new Set(community.members)
    // Get member entity objects
    if (graphData.entities) {
        for (const entity of graphData.entities) {
            if (memberSet.has(entity.getURI().value)) {
                context.memberEntities.push(entity)
            }
        }
    }
    // Gather units that mention community entities by case-insensitive label match
    if (graphData.units) {
        // Drop empty labels: ''.includes('') is always true and would match every unit
        const memberLabels = context.memberEntities
            .map(e => e.getPreferredLabel().toLowerCase())
            .filter(label => label.length > 0)
        for (const unit of graphData.units) {
            const unitContent = unit.getContent().toLowerCase()
            const mentionsMembers = memberLabels.some(label => unitContent.includes(label))
            if (mentionsMembers) {
                context.units.push(unit)
                context.evidence.push(unit.getURI())
                // Stop accumulating text once we have ~3x the summary budget
                if (context.contextText.length < options.maxSummaryLength * 3) {
                    context.contextText += unit.getContent() + '\n\n'
                }
            }
        }
    }
    // Gather relationships whose BOTH endpoints lie inside the community
    if (graphData.relationships) {
        for (const relationship of graphData.relationships) {
            const sourceInCommunity = memberSet.has(relationship.getSourceEntity().value)
            const targetInCommunity = memberSet.has(relationship.getTargetEntity().value)
            if (sourceInCommunity && targetInCommunity) {
                context.relationships.push(relationship)
                context.evidence.push(relationship.getURI())
            }
        }
    }
    // Trim context if too long (the append loop may overshoot by one unit)
    if (context.contextText.length > options.maxSummaryLength * 3) {
        context.contextText = context.contextText.substring(0, options.maxSummaryLength * 3) + '...'
    }
    logger.debug(`Community ${community.id} context: ${context.memberEntities.length} entities, ${context.units.length} units, ${context.relationships.length} relationships`)
    return context
}
/**
* Generate LLM summary for a community
* @param {Object} community - Community object
* @param {Object} context - Community context
* @param {Object} llmHandler - LLM handler
* @param {Object} options - Generation options
* @returns {Promise<Object>} Summary data with keywords and confidence
*/
/**
 * Generate LLM summary for a community
 * @param {Object} community - Community object
 * @param {Object} context - Community context from gatherCommunityContext
 * @param {Object} llmHandler - LLM handler with generateResponse(prompt, system, opts)
 * @param {Object} options - Generation options
 * @returns {Promise<Object|null>} Summary data with keywords and confidence, or null if too short
 */
async function generateCommunitySummary(community, context, llmHandler, options) {
    // Keep the labels as an array: the prompt uses the joined form, but the
    // fallback path needs the array — the old code called .slice(0,3).join()
    // on the joined STRING, which threw a TypeError inside the catch block.
    const memberLabels = context.memberEntities.map(e => e.getPreferredLabel())
    const memberNames = memberLabels.join(', ')
    const prompt = `Analyze this community of related entities and provide a comprehensive summary of their shared theme, domain, or context.
Community Members: ${memberNames}
Context Information:
${context.contextText}
Based on the relationships and context, write a 2-3 sentence summary that captures:
1. The main theme or domain that unites these entities
2. The key relationships or patterns within the community
3. The significance or importance of this grouping
Summary:`
    try {
        const response = await llmHandler.generateResponse(prompt, '', {
            maxTokens: 150,
            temperature: 0.1
        })
        const summary = response.trim()
        if (summary.length < 20) {
            // Too short to be a usable summary — signal caller to keep defaults
            logger.debug(`Generated community summary too short: ${summary.length} chars`)
            return null
        }
        // Extract keywords from summary and member names
        const keywords = extractCommunityKeywords(summary, context.memberEntities)
        // Calculate confidence based on context quality
        const confidence = calculateSummaryConfidence(context, summary)
        return {
            summary: summary,
            keywords: keywords,
            confidence: confidence
        }
    } catch (error) {
        logger.warn(`Failed to generate community summary:`, error.message)
        // Fallback: create simple summary from up to three member names
        const fallbackSummary = `Community of related entities including ${memberLabels.slice(0, 3).join(', ')}${memberLabels.length > 3 ? ' and others' : ''}.`
        return {
            summary: fallbackSummary,
            keywords: memberLabels.slice(0, 3),
            confidence: 0.3
        }
    }
}
/**
* Extract keywords from community summary and members
* @param {string} summary - Generated summary
* @param {Array} memberEntities - Community member entities
* @returns {Array<string>} Extracted keywords
*/
/**
 * Extract keywords from community summary and members
 * @param {string} summary - Generated summary
 * @param {Array} memberEntities - Community member entities
 * @returns {Array<string>} Up to eight deduplicated keywords
 */
function extractCommunityKeywords(summary, memberEntities) {
    const collected = new Set()
    // Seed with up to five member entity labels
    memberEntities.slice(0, 5).forEach(entity => collected.add(entity.getPreferredLabel()))
    // Pull a few significant (non-stop, length > 3) words from the summary
    const candidateWords = summary
        .toLowerCase()
        .replace(/[^\w\s]/g, ' ')
        .split(/\s+/)
        .filter(word => word.length > 3 && !isStopWord(word))
    for (const word of candidateWords.slice(0, 3)) {
        collected.add(word)
    }
    return [...collected].slice(0, 8)
}
/**
* Check if word is a stop word
* @param {string} word - Word to check
* @returns {boolean} True if stop word
*/
// Hoisted to module scope so the Set is built once, not rebuilt on every call
// (isStopWord runs once per candidate word during keyword extraction)
const STOP_WORDS = new Set([
    'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
    'this', 'that', 'these', 'those', 'is', 'are', 'was', 'were', 'be', 'been',
    'have', 'has', 'had', 'will', 'would', 'could', 'should', 'may', 'might',
    'community', 'entities', 'related', 'group', 'members'
])

/**
 * Check if word is a stop word
 * @param {string} word - Word to check (expected lowercase)
 * @returns {boolean} True if stop word
 */
function isStopWord(word) {
    return STOP_WORDS.has(word)
}
/**
* Calculate confidence for generated summary
* @param {Object} context - Community context
* @param {string} summary - Generated summary
* @returns {number} Confidence score 0-1
*/
function calculateSummaryConfidence(context, summary) {
let confidence = 0.3 // Base confidence
// Factor in community size
if (context.memberEntities.length > 2) {
confidence += Math.min(context.memberEntities.length * 0.1, 0.3)
}
// Factor in context richness
if (context.units.length > 0) {
confidence += Math.min(context.units.length * 0.05, 0.2)
}
if (context.relationships.length > 0) {
confidence += Math.min(context.relationships.length * 0.05, 0.15)
}
// Factor in summary quality
if (summary.length > 100) {
confidence += 0.1
}
return Math.min(confidence, 1.0)
}
/**
* Calculate cohesion score for a community
* @param {Object} community - Community object
* @param {Object} graph - Graph object
* @returns {number} Cohesion score 0-1
*/
/**
 * Calculate cohesion score for a community
 * @param {Object} community - Community object with `members` array
 * @param {Object} graph - Graph object with `adjacencyList` Map of node -> Set(neighbors)
 * @returns {number} Cohesion score 0-1 (internal edges / possible edges)
 */
function calculateCommunityCohesion(community, graph) {
    const memberSet = new Set(community.members)
    let internalEdges = 0
    for (const node of memberSet) {
        const neighbors = graph.adjacencyList.get(node) || new Set()
        for (const other of neighbors) {
            // Lexicographic ordering counts each undirected edge exactly once
            if (memberSet.has(other) && node < other) {
                internalEdges += 1
            }
        }
    }
    // Complete-graph edge count for the community size
    const possibleEdges = (memberSet.size * (memberSet.size - 1)) / 2
    return possibleEdges > 0 ? internalEdges / possibleEdges : 0
}
/**
* Create relationships between overlapping communities
* @param {Array<CommunityElement>} communities - Community elements
* @param {Dataset} dataset - RDF dataset
* @param {RDFGraphManager} rdfManager - RDF manager
* @param {Object} graph - Graph object
*/
/**
 * Create relationships between overlapping communities
 * @param {Array<CommunityElement>} communities - Community elements
 * @param {Dataset} dataset - RDF dataset receiving the relationship quads
 * @param {RDFGraphManager} rdfManager - RDF manager
 * @param {Object} graph - Graph object with `adjacencyList` Map
 */
async function createInterCommunityRelationships(communities, dataset, rdfManager, graph) {
    logger.debug('Creating inter-community relationships...')
    const Relationship = (await import('./Relationship.js')).default
    let relationshipCount = 0
    // Precompute member sets once: the O(n^2) pair loop below previously did
    // an O(members) Array.includes lookup for every member of every pair
    const memberSets = communities.map(c => new Set(c.getMembers()))
    for (let i = 0; i < communities.length; i++) {
        for (let j = i + 1; j < communities.length; j++) {
            const comm1 = communities[i]
            const comm2 = communities[j]
            // Check for shared members (overlap)
            const sharedMembers = comm1.getMembers().filter(member => memberSets[j].has(member))
            if (sharedMembers.length > 0) {
                // Create overlap relationship; weight = overlap relative to the smaller community
                const relationship = new Relationship(rdfManager, {
                    id: `comm_overlap_${i}_${j}`,
                    sourceEntity: comm1.getURI(),
                    targetEntity: comm2.getURI(),
                    relationshipType: 'overlaps',
                    content: `Communities share ${sharedMembers.length} member(s)`,
                    weight: sharedMembers.length / Math.min(comm1.getMembers().length, comm2.getMembers().length),
                    bidirectional: true
                })
                relationship.exportToDataset(dataset)
                relationshipCount++
                continue
            }
            // No overlap: count graph edges crossing the two communities
            let connectionCount = 0
            for (const member1 of comm1.getMembers()) {
                const memberEdges = graph.adjacencyList.get(member1) || new Set()
                for (const member2 of comm2.getMembers()) {
                    if (memberEdges.has(member2)) {
                        connectionCount++
                    }
                }
            }
            if (connectionCount > 0) {
                // Normalize by combined size; only record significant connections
                const connectionStrength = connectionCount / (comm1.getMembers().length + comm2.getMembers().length)
                if (connectionStrength > 0.1) {
                    const relationship = new Relationship(rdfManager, {
                        id: `comm_connected_${i}_${j}`,
                        sourceEntity: comm1.getURI(),
                        targetEntity: comm2.getURI(),
                        relationshipType: 'connected_to',
                        content: `Communities connected by ${connectionCount} inter-community edge(s)`,
                        weight: connectionStrength,
                        bidirectional: true
                    })
                    relationship.exportToDataset(dataset)
                    relationshipCount++
                }
            }
        }
    }
    logger.debug(`Created ${relationshipCount} inter-community relationships`)
}