/**
* Wikipedia Search - Fetches search results from Wikipedia API and ingests as RDF
*
* This class provides Wikipedia search functionality and converts results to RDF
* using the Ragno vocabulary. Search results are stored as ragno:Unit instances
* with associated ragno:Entity and ragno:TextElement components.
*/
import fetch from 'node-fetch';
import logger from 'loglevel';
import SPARQLHelper from '../../services/sparql/SPARQLHelper.js';
/**
 * Wikipedia search client that turns MediaWiki search hits into RDF and loads
 * them into a SPARQL store.
 *
 * Each hit is modelled as a ragno:Unit linked to a ragno:Entity (the page) and
 * a ragno:TextElement (the search snippet). Running counters are kept in
 * `this.stats` and exposed through getStatistics() / generateReport().
 */
export default class WikipediaSearch {
  /**
   * @param {Object} options - Configuration options. Any key given here
   *   overrides the corresponding default (the whole object is spread last).
   */
  constructor(options = {}) {
    this.options = {
      sparqlEndpoint: options.sparqlEndpoint || 'http://localhost:3030/wikipedia/update',
      // NOTE(review): hard-coded fallback credentials; callers should supply
      // their own `sparqlAuth` outside of local development.
      sparqlAuth: options.sparqlAuth || { user: 'admin', password: 'admin123' },
      graphURI: options.graphURI || 'http://purl.org/stuff/wikipedia',
      baseURI: options.baseURI || 'http://purl.org/stuff/wikipedia/',
      ragnoBaseURI: options.ragnoBaseURI || 'http://purl.org/stuff/ragno/',
      wikipediaAPIBase: options.wikipediaAPIBase || 'https://en.wikipedia.org/w/api.php',
      timeout: options.timeout || 30000,
      defaultSearchLimit: options.defaultSearchLimit || 10,
      rateLimit: options.rateLimit || 100,
      ...options
    };

    // Namespace prefixes used by the generated triples.
    this.namespaces = {
      ragno: 'http://purl.org/stuff/ragno/',
      rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
      rdfs: 'http://www.w3.org/2000/01/rdf-schema#',
      xsd: 'http://www.w3.org/2001/XMLSchema#',
      dcterms: 'http://purl.org/dc/terms/',
      prov: 'http://www.w3.org/ns/prov#',
      wikipedia: this.options.baseURI
    };

    // Helper used for building and executing SPARQL updates.
    this.sparqlHelper = new SPARQLHelper(this.options.sparqlEndpoint, {
      auth: this.options.sparqlAuth,
      timeout: this.options.timeout,
      continueOnError: false
    });

    // Running counters; timestamps are filled in by ingest().
    this.stats = {
      totalQueries: 0,
      totalResults: 0,
      processedResults: 0,
      generatedUnits: 0,
      generatedTriples: 0,
      errors: [],
      startTime: null,
      endTime: null
    };
  }

  /**
   * Search Wikipedia using the API.
   *
   * @param {string} queryText - Search query text
   * @param {Object} options - Search options. `delay` (ms to wait before the
   *   request) and `limit` (max hits) fall back to the configured `rateLimit`
   *   and `defaultSearchLimit`; other keys are passed through untouched.
   * @returns {Promise<Object>} - `{ query, options, timestamp, results, totalHits, apiResponse }`
   * @throws {Error} When the HTTP request fails or the response lacks `query.search`.
   */
  async search(queryText, options = {}) {
    // `??` (not `||`) so that an explicit 0 - e.g. rateLimit: 0 to disable the
    // delay - is honoured, and so that an undefined option key cannot clobber
    // the computed default.
    const searchOptions = {
      ...options,
      delay: options.delay ?? this.options.rateLimit ?? 100,
      limit: options.limit ?? this.options.defaultSearchLimit ?? 10
    };

    try {
      logger.info(`Searching Wikipedia for: "${queryText}"`);

      // Crude rate limiting: wait before every request.
      if (searchOptions.delay > 0) {
        await new Promise(resolve => setTimeout(resolve, searchOptions.delay));
      }

      const apiUrl = new URL(this.options.wikipediaAPIBase);
      const params = {
        action: 'query',
        list: 'search',
        srsearch: queryText,
        format: 'json',
        srlimit: searchOptions.limit,
        srprop: 'title|snippet|size|timestamp|wordcount'
      };
      for (const [key, value] of Object.entries(params)) {
        apiUrl.searchParams.set(key, value);
      }
      logger.debug(`Wikipedia API URL: ${apiUrl.toString()}`);

      const response = await fetch(apiUrl.toString(), {
        method: 'GET',
        headers: {
          'User-Agent': 'SememWikipediaSearch/1.0 (https://github.com/danja/semem)',
          'Accept': 'application/json'
        },
        timeout: this.options.timeout
      });
      if (!response.ok) {
        throw new Error(`Wikipedia API request failed: ${response.status} ${response.statusText}`);
      }

      const data = await response.json();
      if (!data.query || !data.query.search) {
        throw new Error('Invalid Wikipedia API response structure');
      }

      const hits = data.query.search;
      this.stats.totalQueries++;
      this.stats.totalResults += hits.length;
      logger.info(`Found ${hits.length} Wikipedia results for "${queryText}"`);

      return {
        query: queryText,
        options: searchOptions,
        timestamp: new Date().toISOString(),
        results: hits,
        totalHits: data.query.searchinfo?.totalhits || hits.length,
        apiResponse: data
      };
    } catch (error) {
      logger.error('Wikipedia search failed:', error);
      this.stats.errors.push(`Search "${queryText}": ${error.message}`);
      throw error;
    }
  }

  /**
   * Ingest search results into the SPARQL store as RDF.
   *
   * Never rejects: failures are logged, recorded in the statistics and
   * reported through `success: false`.
   *
   * @param {Object} searchObject - Search results from search() method
   * @returns {Promise<Object>} - `{ success, query, statistics, loadResults }`
   *   on success, `{ success: false, error, query, statistics }` on failure.
   */
  async ingest(searchObject) {
    try {
      this.stats.startTime = new Date();
      logger.info(`Ingesting ${searchObject.results.length} Wikipedia search results for query: "${searchObject.query}"`);

      const units = await this.transformToUnits(searchObject);
      logger.info(`Transformed ${units.length} search results to RDF units`);

      const loadResults = await this.loadUnitsToSPARQL(units);
      logger.info(`Loaded ${loadResults.successful}/${loadResults.total} units to SPARQL store`);

      this.stats.endTime = new Date();
      this.stats.processingTime = this.stats.endTime - this.stats.startTime;
      return {
        success: true,
        query: searchObject.query,
        statistics: this.getStatistics(),
        loadResults: loadResults
      };
    } catch (error) {
      logger.error('Wikipedia ingestion failed:', error);
      this.stats.errors.push(`Ingestion: ${error.message}`);
      this.stats.endTime = new Date();
      return {
        success: false,
        error: error.message,
        // Optional chaining: the failure may be that searchObject itself is
        // null/undefined, and the error handler must not throw in turn.
        query: searchObject?.query,
        statistics: this.getStatistics()
      };
    }
  }

  /**
   * Transform Wikipedia search results to RDF units.
   *
   * A result that fails to transform is logged and skipped; the remaining
   * results are still processed.
   *
   * @param {Object} searchObject - Search results object
   * @returns {Promise<Array>} - Array of RDF unit data
   */
  async transformToUnits(searchObject) {
    const units = [];
    for (let i = 0; i < searchObject.results.length; i++) {
      try {
        const unit = await this.createUnit(searchObject.results[i], searchObject, i);
        units.push(unit);
        this.stats.processedResults++;
        this.stats.generatedUnits++;
        // Progress line every ten results.
        if ((i + 1) % 10 === 0) {
          logger.info(`Processed ${i + 1}/${searchObject.results.length} Wikipedia results`);
        }
      } catch (error) {
        logger.warn(`Failed to transform Wikipedia result ${i}:`, error.message);
        this.stats.errors.push(`Result ${i}: ${error.message}`);
      }
    }
    logger.info(`Successfully transformed ${units.length} Wikipedia results to units`);
    return units;
  }

  /**
   * Create RDF unit from a Wikipedia search result.
   *
   * @param {Object} result - Wikipedia search result
   * @param {Object} searchObject - Original search object
   * @param {number} index - Result index
   * @returns {Object} - RDF unit data (`uri`, `type`, `title`, `snippet`, `metadata`, `triples`)
   */
  async createUnit(result, searchObject, index) {
    // Unit, entity and text element share one id derived from the page id.
    const resultId = this.generateResultId(result);
    const { baseURI } = this.options;
    // NOTE(review): the page URI is pinned to en.wikipedia.org regardless of
    // the configured `wikipediaAPIBase` - confirm this is intended.
    const wikipediaPageURI = `https://en.wikipedia.org/wiki/${encodeURIComponent(result.title.replace(/ /g, '_'))}`;

    const unit = {
      uri: `${baseURI}unit/${resultId}`,
      type: 'wikipedia-search-result',
      title: result.title,
      snippet: this.cleanSnippet(result.snippet),
      metadata: {
        query: searchObject.query,
        searchTimestamp: searchObject.timestamp,
        wikipediaPageId: result.pageid,
        wikipediaTitle: result.title,
        wikipediaPageURI: wikipediaPageURI,
        entityURI: `${baseURI}entity/${resultId}`,
        textElementURI: `${baseURI}text/${resultId}`,
        size: result.size || 0,
        wordcount: result.wordcount || 0,
        lastModified: result.timestamp,
        resultIndex: index,
        namespace: result.ns || 0
      },
      triples: []
    };

    unit.triples = this.generateUnitTriples(unit, result, searchObject);
    this.stats.generatedTriples += unit.triples.length;
    return unit;
  }

  /**
   * Generate RDF triples for a Wikipedia unit.
   *
   * @param {Object} unit - Unit data
   * @param {Object} result - Original Wikipedia result
   * @param {Object} searchObject - Original search object
   * @returns {Array} - Array of RDF triple strings
   */
  generateUnitTriples(unit, result, searchObject) {
    const XSD = 'http://www.w3.org/2001/XMLSchema#';
    const plain = (value) => SPARQLHelper.createLiteral(value);
    const typed = (value, type) => SPARQLHelper.createLiteral(value, `${XSD}${type}`);

    const meta = unit.metadata;
    const unitURI = `<${unit.uri}>`;
    const entityURI = `<${meta.entityURI}>`;
    const textElementURI = `<${meta.textElementURI}>`;
    const wikipediaPageURI = `<${meta.wikipediaPageURI}>`;

    return [
      // Core unit properties
      `${unitURI} rdf:type ragno:Unit .`,
      `${unitURI} rdfs:label ${plain(unit.title)} .`,
      `${unitURI} ragno:unitType ${plain(unit.type)} .`,
      // Metadata properties
      `${unitURI} dcterms:identifier ${plain(meta.wikipediaPageId.toString())} .`,
      `${unitURI} dcterms:source ${plain('wikipedia-search')} .`,
      `${unitURI} dcterms:created ${typed(searchObject.timestamp, 'dateTime')} .`,
      `${unitURI} dcterms:modified ${typed(result.timestamp, 'dateTime')} .`,
      // Search provenance
      `${unitURI} prov:wasGeneratedBy ${plain('wikipedia-search')} .`,
      `${unitURI} ragno:searchQuery ${plain(searchObject.query)} .`,
      `${unitURI} ragno:resultIndex ${typed(meta.resultIndex.toString(), 'integer')} .`,
      `${unitURI} ragno:namespace ${typed(meta.namespace.toString(), 'integer')} .`,
      // Entity (Wikipedia page) properties
      `${entityURI} rdf:type ragno:Entity .`,
      `${entityURI} rdfs:label ${plain(unit.title)} .`,
      `${entityURI} ragno:entityType ${plain('wikipedia-page')} .`,
      `${entityURI} dcterms:identifier ${plain(meta.wikipediaPageId.toString())} .`,
      `${entityURI} ragno:pageSize ${typed(meta.size.toString(), 'integer')} .`,
      `${entityURI} ragno:wordCount ${typed(meta.wordcount.toString(), 'integer')} .`,
      `${entityURI} ragno:wikipediaURI ${wikipediaPageURI} .`,
      // Unit-Entity relationship
      `${unitURI} ragno:hasEntity ${entityURI} .`,
      `${entityURI} ragno:belongsToUnit ${unitURI} .`,
      // TextElement (snippet) properties
      `${textElementURI} rdf:type ragno:TextElement .`,
      `${textElementURI} rdfs:label ${plain('Wikipedia Search Snippet')} .`,
      `${textElementURI} ragno:content ${plain(unit.snippet)} .`,
      `${textElementURI} ragno:textType ${plain('search-snippet')} .`,
      `${textElementURI} ragno:contentLength ${typed(unit.snippet.length.toString(), 'integer')} .`,
      // TextElement provenance
      `${textElementURI} prov:wasDerivedFrom ${wikipediaPageURI} .`,
      `${textElementURI} prov:wasGeneratedBy ${plain('wikipedia-search-api')} .`,
      // Entity-TextElement relationship
      `${entityURI} ragno:hasTextElement ${textElementURI} .`,
      `${textElementURI} ragno:describesEntity ${entityURI} .`
    ];
  }

  /**
   * Load units to the SPARQL store, one INSERT DATA update per unit.
   *
   * A failing unit is logged and recorded; loading continues with the next one.
   *
   * @param {Array} units - Array of unit data
   * @returns {Promise<Object>} - Aggregate produced by SPARQLHelper.getExecutionStats
   */
  async loadUnitsToSPARQL(units) {
    logger.info(`Loading ${units.length} Wikipedia units to SPARQL store`);
    const results = [];
    for (let i = 0; i < units.length; i++) {
      const unit = units[i];
      logger.debug(`Loading unit ${i + 1}/${units.length}: ${unit.title}`);
      try {
        const unitTriples = unit.triples.join('\n ');
        const query = this.sparqlHelper.createInsertDataQuery(this.options.graphURI, unitTriples);
        const result = await this.sparqlHelper.executeUpdate(query);
        results.push(result);
        if (!result.success) {
          logger.error(`Unit ${i + 1} (${unit.title}) failed:`, result.error);
          this.stats.errors.push(`Unit "${unit.title}": ${result.error}`);
        }
      } catch (error) {
        logger.error(`Failed to load unit ${i + 1} (${unit.title}):`, error);
        this.stats.errors.push(`Unit "${unit.title}": ${error.message}`);
        results.push({ success: false, error: error.message });
      }
    }
    return SPARQLHelper.getExecutionStats(results);
  }

  /**
   * Clean Wikipedia snippet text: strip HTML tags, decode the common HTML
   * entities and collapse whitespace.
   *
   * @param {string} snippet - Raw snippet from Wikipedia
   * @returns {string} - Cleaned snippet text ('' for empty/missing input)
   */
  cleanSnippet(snippet) {
    if (!snippet) return '';
    const entities = {
      quot: '"',
      amp: '&',
      lt: '<',
      gt: '>',
      apos: "'",
      '#39': "'",
      '#039': "'"
    };
    return String(snippet)
      .replace(/<[^>]*>/g, '') // Remove HTML tags
      // Decode in a single pass so that "&amp;lt;" becomes "&lt;" rather than
      // being decoded twice into "<".
      .replace(/&(quot|amp|lt|gt|apos|#0?39);/g, (match, name) => entities[name])
      .replace(/\s+/g, ' ') // Normalize whitespace
      .trim();
  }

  /**
   * Generate unique ID for a Wikipedia result, based on its page id.
   *
   * @param {Object} result - Wikipedia search result
   * @returns {string} - Unique result ID (`wp_<pageid>`)
   */
  generateResultId(result) {
    return `wp_${result.pageid}`;
  }

  /**
   * Get processing statistics.
   *
   * @returns {Object} - Copy of the raw counters plus derived values:
   *   `processingTimeMs` (null until an ingest has finished), `successRate`
   *   (percentage of found results that were transformed) and `avgTriplesPerUnit`.
   */
  getStatistics() {
    const { startTime, endTime, totalResults, processedResults, generatedUnits, generatedTriples } = this.stats;
    return {
      ...this.stats,
      processingTimeMs: endTime ? endTime - startTime : null,
      successRate: totalResults > 0 ? (processedResults / totalResults) * 100 : 0,
      avgTriplesPerUnit: generatedUnits > 0 ? generatedTriples / generatedUnits : 0
    };
  }

  /**
   * Query loaded Wikipedia units for verification.
   *
   * The query endpoint is derived from the update endpoint by replacing
   * '/update' with '/query'.
   *
   * @param {number} limit - Maximum number of results to return. Values that
   *   are not non-negative integers fall back to 10.
   * @returns {Promise<Object>} - Parsed SPARQL JSON results
   * @throws {Error} When the request fails or returns a non-2xx status.
   */
  async queryUnits(limit = 10) {
    const queryEndpoint = this.options.sparqlEndpoint.replace('/update', '/query');

    // `limit` ends up inside the query text, so only a plain integer may pass.
    const requestedLimit = Number(limit);
    const safeLimit = Number.isSafeInteger(requestedLimit) && requestedLimit >= 0 ? requestedLimit : 10;

    const query = [
      '',
      'PREFIX ragno: <http://purl.org/stuff/ragno/>',
      'PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>',
      'PREFIX dcterms: <http://purl.org/dc/terms/>',
      'SELECT ?unit ?label ?query ?namespace ?entity ?textElement',
      `FROM <${this.options.graphURI}>`,
      'WHERE {',
      '?unit a ragno:Unit ;',
      'rdfs:label ?label ;',
      'ragno:searchQuery ?query ;',
      'ragno:namespace ?namespace ;',
      'ragno:hasEntity ?entity .',
      '?entity ragno:hasTextElement ?textElement .',
      '}',
      'ORDER BY ?unit',
      `LIMIT ${safeLimit}`
    ].join('\n');

    const headers = {
      'Content-Type': 'application/sparql-query',
      'Accept': 'application/sparql-results+json'
    };
    const auth = this.options.sparqlAuth;
    if (auth) {
      headers['Authorization'] = `Basic ${btoa(`${auth.user}:${auth.password}`)}`;
    }

    try {
      const response = await fetch(queryEndpoint, {
        method: 'POST',
        headers,
        body: query,
        // Same timeout option that search() passes to fetch.
        timeout: this.options.timeout
      });
      if (!response.ok) {
        throw new Error(`Query failed: ${response.status} ${response.statusText}`);
      }
      return await response.json();
    } catch (error) {
      logger.error('Failed to query Wikipedia units:', error);
      throw error;
    }
  }

  /**
   * Generate summary report.
   *
   * @returns {Object} - `{ summary, configuration, errors, timestamp }`;
   *   `errors` holds at most the first ten recorded error messages.
   */
  generateReport() {
    const stats = this.getStatistics();
    // Null check rather than truthiness: a 0 ms run is a real measurement.
    const processingTime = stats.processingTimeMs != null
      ? `${(stats.processingTimeMs / 1000).toFixed(2)}s`
      : 'N/A';

    return {
      summary: {
        totalQueries: stats.totalQueries,
        totalResults: stats.totalResults,
        processedResults: stats.processedResults,
        generatedUnits: stats.generatedUnits,
        generatedTriples: stats.generatedTriples,
        successRate: `${stats.successRate.toFixed(2)}%`,
        processingTime,
        avgTriplesPerUnit: stats.avgTriplesPerUnit.toFixed(2)
      },
      configuration: {
        graphURI: this.options.graphURI,
        sparqlEndpoint: this.options.sparqlEndpoint,
        wikipediaAPIBase: this.options.wikipediaAPIBase
      },
      errors: stats.errors.length > 0 ? stats.errors.slice(0, 10) : [],
      timestamp: new Date().toISOString()
    };
  }
}