/**
* WikidataSearch.js - Advanced entity search functionality for Wikidata
*
* This class provides sophisticated search capabilities for finding Wikidata
* entities using various search strategies: text-based, concept-based,
* Wikipedia title matching, and hierarchical exploration.
*
* Key Features:
* - Multi-strategy entity search
* - Concept-to-entity mapping
* - Wikipedia article linking
* - Instance/subclass hierarchy traversal
* - Search result ranking and filtering
*/
import logger from 'loglevel';
import WikidataConnector from './WikidataConnector.js';
export default class WikidataSearch {
    /**
     * Create a search helper.
     *
     * @param {Object} [options] - Overrides for the defaults below. The raw
     *   (un-merged) options object is also forwarded to WikidataConnector.
     * @param {string} [options.defaultLanguage='en'] - Language used for search and labels
     * @param {number} [options.maxResults=10] - Default result limit for text searches
     * @param {number} [options.minConfidence=0.3] - Minimum confidence kept in text results
     * @param {number} [options.requestDelayMs=100] - Pause between consecutive
     *   requests issued in a loop (concept and hierarchy searches)
     */
    constructor(options = {}) {
        this.options = {
            defaultLanguage: 'en',
            maxResults: 10,
            minConfidence: 0.3,
            enableHierarchy: true,
            enableImages: false,
            enableCoordinates: false,
            requestDelayMs: 100,
            ...options
        };
        this.connector = new WikidataConnector(options);
        // Search statistics
        this.stats = WikidataSearch.createEmptyStats();
    }

    /**
     * Build a zeroed statistics record; shared by the constructor and resetStats
     * so the two can never drift apart.
     * @private
     * @returns {Object} Fresh statistics object
     */
    static createEmptyStats() {
        return {
            searchesByType: {
                text: 0,
                concept: 0,
                wikipedia: 0,
                hierarchy: 0
            },
            totalEntitiesFound: 0,
            averageResultsPerSearch: 0
        };
    }

    /**
     * Search for entities by text with advanced filtering
     * @param {string|Array<string>} searchTerms - Text to search for; multiple
     *   terms are joined with a single space into one search string
     * @param {Object} options - Search options (limit, language, filterTypes,
     *   searchTerm). Also forwarded unchanged to the connector's executeQuery.
     * @returns {Promise<Object>} Search results with ranked entities; on failure
     *   `{ success: false, error, searchTerms, searchType }`
     */
    async searchByText(searchTerms, options = {}) {
        const termArray = Array.isArray(searchTerms) ? searchTerms : [searchTerms];
        const searchOptions = {
            limit: this.options.maxResults,
            language: this.options.defaultLanguage,
            includeAliases: true,
            includeDescriptions: true,
            filterTypes: [], // Optional array of entity types to filter by
            // Needed by calculateConfidence for the exact-label boost
            searchTerm: termArray.join(' '),
            ...options
        };
        this.stats.searchesByType.text++;
        try {
            // Use Wikidata's full-text search via wikibase:mwapi
            const sparql = this.buildTextSearchQuery(termArray, searchOptions);
            const result = await this.connector.executeQuery(sparql, options);
            if (!result.success) {
                return {
                    success: false,
                    error: result.error,
                    searchTerms: termArray,
                    searchType: 'text'
                };
            }
            const entities = this.processSearchResults(result.data, searchOptions);
            this.updateSearchStats(entities.length);
            return {
                success: true,
                entities: entities,
                searchTerms: termArray,
                totalFound: entities.length,
                searchType: 'text'
            };
        } catch (error) {
            logger.error('Text search failed:', error.message);
            return {
                success: false,
                error: error.message,
                searchTerms: termArray,
                searchType: 'text'
            };
        }
    }

    /**
     * Search for entities by Wikipedia article titles
     * @param {string|Array<string>} titles - Wikipedia article titles
     * @param {Object} options - Search options (language, exactMatch,
     *   includeRedirects); passed to the connector's findEntitiesByWikipediaTitle
     * @returns {Promise<Object>} Matching Wikidata entities
     */
    async searchByWikipediaTitle(titles, options = {}) {
        const searchOptions = {
            language: this.options.defaultLanguage,
            exactMatch: false,
            includeRedirects: true,
            ...options
        };
        this.stats.searchesByType.wikipedia++;
        const titleArray = Array.isArray(titles) ? titles : [titles];
        try {
            const result = await this.connector.findEntitiesByWikipediaTitle(titleArray, searchOptions);
            if (!result.success) {
                return {
                    success: false,
                    error: result.error,
                    searchTerms: titleArray,
                    searchType: 'wikipedia'
                };
            }
            const entities = this.processWikipediaResults(result.data, searchOptions);
            this.updateSearchStats(entities.length);
            return {
                success: true,
                entities: entities,
                searchTerms: titleArray,
                totalFound: entities.length,
                searchType: 'wikipedia'
            };
        } catch (error) {
            logger.error('Wikipedia title search failed:', error.message);
            return {
                success: false,
                error: error.message,
                searchTerms: titleArray,
                searchType: 'wikipedia'
            };
        }
    }

    /**
     * Search for entities by extracted concepts
     *
     * Each usable concept triggers one searchByText call, so text-search
     * statistics are updated once per concept in addition to the single
     * concept-search count.
     *
     * @param {Array<Object>} concepts - Concept objects with value, type, confidence.
     *   A non-array input is treated as an empty list.
     * @param {Object} options - Search options (minConceptConfidence,
     *   maxEntitiesPerConcept, prioritizeHighConfidence, language, requestDelayMs)
     * @returns {Promise<Object>} Concept-to-entity mappings
     */
    async searchByConcepts(concepts, options = {}) {
        const searchOptions = {
            minConceptConfidence: this.options.minConfidence,
            maxEntitiesPerConcept: 5,
            prioritizeHighConfidence: true,
            requestDelayMs: this.options.requestDelayMs ?? 100,
            ...options
        };
        const conceptList = Array.isArray(concepts) ? concepts : [];
        this.stats.searchesByType.concept++;
        try {
            const conceptResults = [];
            for (const concept of conceptList) {
                // Nothing to search for without a value
                if (concept?.value == null || concept.value === '') {
                    continue;
                }
                // Skip low-confidence concepts. A null check (not truthiness) is
                // used so that an explicit confidence of 0 is filtered too.
                if (concept.confidence != null &&
                    concept.confidence < searchOptions.minConceptConfidence) {
                    continue;
                }
                // Search for entities matching this concept
                const entitySearch = await this.searchByText(concept.value, {
                    limit: searchOptions.maxEntitiesPerConcept,
                    language: searchOptions.language || this.options.defaultLanguage
                });
                if (entitySearch.success && entitySearch.entities.length > 0) {
                    conceptResults.push({
                        concept: concept,
                        entities: entitySearch.entities,
                        entityCount: entitySearch.entities.length
                    });
                }
                // Add delay between concept searches to respect rate limits
                await this.sleep(searchOptions.requestDelayMs);
            }
            // Sort by concept confidence if available; concepts without a
            // confidence are ranked as 0.5, an explicit 0 stays 0.
            if (searchOptions.prioritizeHighConfidence) {
                conceptResults.sort((a, b) =>
                    (b.concept.confidence ?? 0.5) - (a.concept.confidence ?? 0.5)
                );
            }
            const totalEntities = conceptResults.reduce((sum, cr) => sum + cr.entityCount, 0);
            // The nested searchByText calls already added these entities to
            // totalEntitiesFound; pass 0 so they are not counted twice while
            // still refreshing the average with this concept search included.
            this.updateSearchStats(0);
            return {
                success: true,
                conceptMappings: conceptResults,
                totalConcepts: conceptList.length,
                mappedConcepts: conceptResults.length,
                totalEntities: totalEntities,
                searchType: 'concept'
            };
        } catch (error) {
            logger.error('Concept search failed:', error.message);
            return {
                success: false,
                error: error.message,
                totalConcepts: conceptList.length,
                searchType: 'concept'
            };
        }
    }

    /**
     * Get instance/subclass hierarchy for entities
     *
     * Entity IDs that are not well-formed are skipped with a warning instead
     * of being interpolated into a query. IDs whose query reports failure are
     * silently omitted from the result.
     *
     * @param {string|Array<string>} entityIds - Wikidata entity IDs
     * @param {Object} options - Hierarchy options (includeSubclasses,
     *   includeInstances, language, requestDelayMs). Also forwarded unchanged
     *   to the connector's executeQuery. `maxDepth` is accepted but not used
     *   by the generated query.
     * @returns {Promise<Object>} Hierarchical relationships
     */
    async getInstanceHierarchy(entityIds, options = {}) {
        const hierarchyOptions = {
            maxDepth: 3,
            includeSubclasses: true,
            includeInstances: true,
            language: this.options.defaultLanguage,
            requestDelayMs: this.options.requestDelayMs ?? 100,
            ...options
        };
        this.stats.searchesByType.hierarchy++;
        const idArray = Array.isArray(entityIds) ? entityIds : [entityIds];
        try {
            const hierarchyResults = [];
            for (const entityId of idArray) {
                if (!this.isValidEntityId(entityId)) {
                    logger.warn('Skipping invalid Wikidata entity ID:', entityId);
                    continue;
                }
                const sparql = this.buildHierarchyQuery(entityId, hierarchyOptions);
                const result = await this.connector.executeQuery(sparql, options);
                if (result.success) {
                    const hierarchy = this.processHierarchyResults(result.data, entityId, hierarchyOptions);
                    hierarchyResults.push(hierarchy);
                }
                // Rate limiting
                await this.sleep(hierarchyOptions.requestDelayMs);
            }
            return {
                success: true,
                hierarchies: hierarchyResults,
                searchType: 'hierarchy'
            };
        } catch (error) {
            logger.error('Hierarchy search failed:', error.message);
            return {
                success: false,
                error: error.message,
                searchType: 'hierarchy'
            };
        }
    }

    /**
     * Escape a value for use inside a double-quoted SPARQL string literal.
     * Backslashes are escaped first so that the escapes added for quotes and
     * line breaks are not themselves doubled.
     * @private
     * @param {*} value - Value to embed; coerced with String()
     * @returns {string} Escaped text (without surrounding quotes)
     */
    escapeSparqlString(value) {
        return String(value)
            .replace(/\\/g, '\\\\')
            .replace(/"/g, '\\"')
            .replace(/\n/g, '\\n')
            .replace(/\r/g, '\\r');
    }

    /**
     * Check that a value looks like a Wikidata entity ID (Q/P/L followed by
     * digits) and is therefore safe to append to the `wd:` prefix.
     * @private
     * @param {*} id - Candidate entity ID
     * @returns {boolean} True when the ID is well-formed
     */
    isValidEntityId(id) {
        return typeof id === 'string' && /^[QPL]\d+$/.test(id);
    }

    /**
     * Build text search query using wikibase:mwapi
     * @private
     * @param {Array<string>} searchTerms - Terms joined with a space into one search string
     * @param {Object} options - Uses `language`, `limit` and optional `filterTypes`
     * @returns {string} SPARQL query text
     * @throws {TypeError} If a filterTypes entry is not a well-formed entity ID
     */
    buildTextSearchQuery(searchTerms, options) {
        const searchString = this.escapeSparqlString(searchTerms.join(' '));
        const language = this.escapeSparqlString(options.language);
        const limit = this.escapeSparqlString(options.limit);
        const filterTypes = Array.isArray(options.filterTypes) ? options.filterTypes : [];

        let typeFilter = '';
        if (filterTypes.length > 0) {
            const typeValues = filterTypes.map(typeId => {
                if (!this.isValidEntityId(typeId)) {
                    throw new TypeError(`Invalid Wikidata entity ID: ${String(typeId)}`);
                }
                return `wd:${typeId}`;
            }).join(' ');
            typeFilter = `FILTER EXISTS { ?item wdt:P31/wdt:P279* ?type . VALUES ?type { ${typeValues} } }`;
        }

        return `
SELECT DISTINCT ?item ?itemLabel ?itemDescription ?score WHERE {
SERVICE wikibase:mwapi {
bd:serviceParam wikibase:api "EntitySearch" .
bd:serviceParam wikibase:endpoint "www.wikidata.org" .
bd:serviceParam mwapi:search "${searchString}" .
bd:serviceParam mwapi:language "${language}" .
bd:serviceParam mwapi:limit "${limit}" .
?item wikibase:apiOutputItem mwapi:item .
?score wikibase:apiOutput "@score" .
}
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],${language}". }
${typeFilter}
}
ORDER BY DESC(?score)`;
    }

    /**
     * Build hierarchy query for P31/P279 relationships
     *
     * Every row is bound with depth 1; `options.maxDepth` is not applied here.
     *
     * @private
     * @param {string} entityId - Root entity ID
     * @param {Object} options - Uses `includeSubclasses`, `includeInstances`, `language`
     * @returns {string} SPARQL query text (limited to 50 rows)
     * @throws {TypeError} If entityId is not a well-formed entity ID
     */
    buildHierarchyQuery(entityId, options) {
        if (!this.isValidEntityId(entityId)) {
            throw new TypeError(`Invalid Wikidata entity ID: ${String(entityId)}`);
        }
        const language = this.escapeSparqlString(options.language);
        const subclassClause = options.includeSubclasses ? `
UNION {
wd:${entityId} wdt:P279* ?related .
BIND("subclassOf" AS ?relationship)
BIND(1 AS ?depth)
}` : '';
        const instanceClause = options.includeInstances ? `
UNION {
?related wdt:P31 wd:${entityId} .
BIND("hasInstance" AS ?relationship)
BIND(1 AS ?depth)
}` : '';

        return `
SELECT DISTINCT ?related ?relatedLabel ?relationship ?relationshipLabel ?depth WHERE {
{
wd:${entityId} wdt:P31* ?related .
BIND("instanceOf" AS ?relationship)
BIND(1 AS ?depth)
}
${subclassClause}
${instanceClause}
FILTER(?related != wd:${entityId})
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],${language}". }
}
LIMIT 50`;
    }

    /**
     * Process search results and add confidence scores
     *
     * Entities whose confidence is below the instance-level `minConfidence`
     * are dropped. Bindings without an `item` URI are ignored.
     *
     * @private
     * @param {Object} data - SPARQL JSON result (`results.bindings`)
     * @param {Object} options - Search options passed on to calculateConfidence
     * @returns {Array<Object>} Entity records
     */
    processSearchResults(data, options) {
        const bindings = data?.results?.bindings;
        if (!Array.isArray(bindings)) return [];
        return bindings
            .filter(binding => binding?.item?.value)
            .map(binding => {
                const entity = {
                    id: this.extractEntityId(binding.item.value),
                    uri: binding.item.value,
                    label: binding.itemLabel?.value || 'Unknown',
                    description: binding.itemDescription?.value || '',
                    score: binding.score ? parseFloat(binding.score.value) : 0.5,
                    source: 'wikidata',
                    searchType: 'text'
                };
                // Calculate confidence based on score and label match
                entity.confidence = this.calculateConfidence(entity, options);
                return entity;
            })
            .filter(entity => entity.confidence >= this.options.minConfidence);
    }

    /**
     * Process Wikipedia search results
     * @private
     * @param {Object} data - SPARQL JSON result (`results.bindings`)
     * @param {Object} options - Currently unused
     * @returns {Array<Object>} Entity records with a fixed confidence of 0.9
     */
    processWikipediaResults(data, options) {
        const bindings = data?.results?.bindings;
        if (!Array.isArray(bindings)) return [];
        return bindings
            .filter(binding => binding?.item?.value)
            .map(binding => ({
                id: this.extractEntityId(binding.item.value),
                uri: binding.item.value,
                label: binding.itemLabel?.value || 'Unknown',
                description: binding.itemDescription?.value || '',
                wikipediaTitle: binding.wikipediaTitle?.value || '',
                confidence: 0.9, // High confidence for Wikipedia matches
                source: 'wikidata',
                searchType: 'wikipedia'
            }));
    }

    /**
     * Process hierarchy results
     * @private
     * @param {Object} data - SPARQL JSON result (`results.bindings`)
     * @param {string} rootEntityId - Entity the hierarchy was requested for
     * @param {Object} options - Currently unused
     * @returns {Object} `{ rootEntity, relationships, totalRelationships }`;
     *   `totalRelationships` is omitted when the result has no bindings array
     */
    processHierarchyResults(data, rootEntityId, options) {
        const bindings = data?.results?.bindings;
        if (!Array.isArray(bindings)) return { rootEntity: rootEntityId, relationships: [] };
        const relationships = bindings
            .filter(binding => binding?.related?.value)
            .map(binding => ({
                relatedEntity: {
                    id: this.extractEntityId(binding.related.value),
                    uri: binding.related.value,
                    label: binding.relatedLabel?.value || 'Unknown'
                },
                relationshipType: binding.relationship?.value || 'related',
                // Explicit radix; a missing or unparsable depth falls back to 1
                depth: Number.parseInt(binding.depth?.value, 10) || 1
            }));
        return {
            rootEntity: rootEntityId,
            relationships: relationships,
            totalRelationships: relationships.length
        };
    }

    /**
     * Calculate confidence score for entity matches
     *
     * Starts from the entity score (0.5 when the score is 0, NaN or missing),
     * adds 0.3 for a case-insensitive exact label match against
     * `options.searchTerm` and 0.1 for a description longer than 10
     * characters. Each boost is capped at 1.0.
     *
     * @private
     * @param {Object} entity - Entity with `score`, `label`, `description`
     * @param {Object} options - Uses optional `searchTerm`
     * @returns {number} Confidence value
     */
    calculateConfidence(entity, options) {
        let confidence = entity.score || 0.5;
        const searchTerm = options?.searchTerm;
        // Boost confidence for exact label matches
        if (typeof searchTerm === 'string' && searchTerm !== '' &&
            typeof entity.label === 'string' &&
            entity.label.toLowerCase() === searchTerm.trim().toLowerCase()) {
            confidence = Math.min(confidence + 0.3, 1.0);
        }
        // Boost confidence for entities with descriptions
        if (entity.description && entity.description.length > 10) {
            confidence = Math.min(confidence + 0.1, 1.0);
        }
        return confidence;
    }

    /**
     * Extract entity ID from Wikidata URI
     * @private
     * @param {string} uri - URI ending in `/Q<digits>`
     * @returns {string|null} The Q-ID, or null when the URI does not end in one
     */
    extractEntityId(uri) {
        if (typeof uri !== 'string') return null;
        const match = uri.match(/\/(Q\d+)$/);
        return match ? match[1] : null;
    }

    /**
     * Update search statistics
     * @private
     * @param {number} entityCount - Newly found entities to add to the total;
     *   pass 0 to only refresh the average
     */
    updateSearchStats(entityCount) {
        this.stats.totalEntitiesFound += entityCount;
        const totalSearches = Object.values(this.stats.searchesByType).reduce((a, b) => a + b, 0);
        this.stats.averageResultsPerSearch = totalSearches > 0 ?
            this.stats.totalEntitiesFound / totalSearches : 0;
    }

    /**
     * Sleep utility function
     * @private
     * @param {number} ms - Delay in milliseconds
     * @returns {Promise<void>} Resolves after the delay
     */
    async sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    /**
     * Get search statistics
     * @returns {Object} Copy of the search usage statistics plus the
     *   connector's own statistics under `connectorStats`
     */
    getStats() {
        return {
            ...this.stats,
            // Copy the nested counters so callers cannot mutate live state
            searchesByType: { ...this.stats.searchesByType },
            connectorStats: this.connector.getStats()
        };
    }

    /**
     * Reset statistics (both this class's counters and the connector's)
     */
    resetStats() {
        this.stats = WikidataSearch.createEmptyStats();
        this.connector.resetStats();
    }
}