JSDoc: Source: services/document/PDFConverter.js

Source: services/document/PDFConverter.js
import pdf2md from '@opendocsg/pdf2md';
import { readFileSync } from 'fs';
import { v4 as uuidv4 } from 'uuid';
import logger from 'loglevel';

/**
 * PDF to Markdown converter service
 * Converts PDF files to markdown format with metadata extraction
 */
export default class PDFConverter {
  /**
   * Convert a PDF file on disk to markdown.
   *
   * The file is read synchronously, passed to pdf2md, and the resulting
   * markdown is returned together with metadata describing the conversion.
   *
   * @param {string} filePath - Path to PDF file
   * @param {Object} [options] - Conversion options; every key is forwarded to pdf2md
   * @param {Object} [options.metadata] - Extra fields merged over the generated metadata
   * @returns {Promise<Object>} `{ markdown, metadata, success: true }`
   * @throws {Error} If `filePath` is missing or not a string. Any failure while
   *   reading or converting is logged and rethrown as a wrapping Error whose
   *   `cause` is the original error.
   */
  static async convert(filePath, options = {}) {
    if (!filePath || typeof filePath !== 'string') {
      throw new Error('PDFConverter: filePath is required and must be a string');
    }

    try {
      // Reading doubles as the existence/accessibility check: it throws on failure.
      const buffer = readFileSync(filePath);

      if (buffer.length === 0) {
        throw new Error(`PDFConverter: PDF file is empty: ${filePath}`);
      }

      const startTime = Date.now();

      // Convert PDF to markdown. `...options` comes last, so caller-supplied
      // keys take precedence over the two defaults above it.
      const result = await pdf2md(buffer, {
        outputDir: options.outputDir || null,
        debug: options.debug || false,
        ...options
      });

      const processingTime = Date.now() - startTime;

      if (!result || typeof result !== 'string') {
        throw new Error(`PDFConverter: Failed to extract text from PDF: ${filePath}`);
      }

      const text = result;

      // Generated metadata; caller-supplied `options.metadata` is spread last and wins.
      const metadata = {
        sourceFile: filePath,
        fileSize: buffer.length,
        processingTime,
        conversionId: uuidv4(),
        timestamp: new Date().toISOString(),
        format: 'pdf',
        converter: 'pdf2md',
        pages: this.estimatePageCount(text),
        ...options.metadata
      };

      logger.debug(`PDFConverter: Converted ${filePath} (${buffer.length} bytes) in ${processingTime}ms`);

      return {
        markdown: text,
        metadata,
        success: true
      };

    } catch (error) {
      logger.error(`PDFConverter: Error converting ${filePath}:`, error.message);
      // Keep the original error reachable (stack, `code`, ...) via `cause`.
      throw new Error(`PDFConverter: Failed to convert PDF: ${error.message}`, { cause: error });
    }
  }

  /**
   * Convert an in-memory PDF buffer to markdown.
   *
   * @param {Buffer} buffer - PDF file buffer
   * @param {Object} [options] - Conversion options; every key is forwarded to pdf2md
   * @param {Object} [options.metadata] - Extra fields merged over the generated metadata
   * @returns {Promise<Object>} `{ markdown, metadata, success: true }`
   * @throws {Error} If `buffer` is not a Buffer or is empty. Any failure during
   *   conversion is logged and rethrown as a wrapping Error whose `cause` is
   *   the original error.
   */
  static async convertBuffer(buffer, options = {}) {
    if (!Buffer.isBuffer(buffer)) {
      throw new Error('PDFConverter: buffer must be a Buffer instance');
    }

    if (buffer.length === 0) {
      throw new Error('PDFConverter: PDF buffer is empty');
    }

    try {
      const startTime = Date.now();

      // Convert PDF buffer to markdown. `...options` comes last, so
      // caller-supplied keys take precedence over the two defaults above it.
      const result = await pdf2md(buffer, {
        outputDir: options.outputDir || null,
        debug: options.debug || false,
        ...options
      });

      const processingTime = Date.now() - startTime;

      if (!result || typeof result !== 'string') {
        throw new Error('PDFConverter: Failed to extract text from PDF buffer');
      }

      const text = result;

      // Generated metadata; caller-supplied `options.metadata` is spread last and wins.
      const metadata = {
        fileSize: buffer.length,
        processingTime,
        conversionId: uuidv4(),
        timestamp: new Date().toISOString(),
        format: 'pdf',
        converter: 'pdf2md',
        pages: this.estimatePageCount(text),
        ...options.metadata
      };

      logger.debug(`PDFConverter: Converted PDF buffer (${buffer.length} bytes) in ${processingTime}ms`);

      return {
        markdown: text,
        metadata,
        success: true
      };

    } catch (error) {
      logger.error('PDFConverter: Error converting PDF buffer:', error.message);
      // Keep the original error reachable (stack, `code`, ...) via `cause`.
      throw new Error(`PDFConverter: Failed to convert PDF buffer: ${error.message}`, { cause: error });
    }
  }

  /**
   * Validate a PDF file by checking that it starts with the `%PDF` marker.
   *
   * Never rejects: read errors are reported through the returned object.
   *
   * @param {string} filePath - Path to PDF file
   * @returns {Promise<Object>} `{ valid, fileSize, filePath, message }`;
   *   `fileSize` is 0 when the file could not be read
   */
  static async validate(filePath) {
    try {
      const buffer = readFileSync(filePath);

      // Decode only the first four bytes, one character per byte, so binary
      // content can never be merged or replaced by a multi-byte decoder.
      const pdfHeader = buffer.toString('latin1', 0, 4);
      const isValidPDF = pdfHeader === '%PDF';

      return {
        valid: isValidPDF,
        fileSize: buffer.length,
        filePath,
        message: isValidPDF ? 'Valid PDF file' : 'Invalid PDF file - missing PDF header'
      };

    } catch (error) {
      return {
        valid: false,
        fileSize: 0,
        filePath,
        message: `File validation failed: ${error.message}`
      };
    }
  }

  /**
   * Estimate page count from markdown text.
   *
   * This is a heuristic based purely on text length, not a count of the
   * pages in the source PDF.
   *
   * @private
   * @param {string} text - Markdown text
   * @returns {number} 0 for empty or non-string input, otherwise at least 1
   */
  static estimatePageCount(text) {
    if (!text || typeof text !== 'string') return 0;

    // Rough estimation: ~500 words per page, ~5 chars per word
    const charsPerPage = 2500;

    return Math.max(1, Math.ceil(text.length / charsPerPage));
  }

  /**
   * Get supported file extensions
   * @returns {Array<string>} Supported extensions (lowercase, with leading dot)
   */
  static getSupportedExtensions() {
    return ['.pdf'];
  }

  /**
   * Check if file extension is supported (case-insensitive).
   *
   * The extension is everything from the last `.` in `filePath` onwards.
   *
   * @param {string} filePath - File path to check
   * @returns {boolean} True if supported; false for non-string, empty, or dot-less input
   */
  static isSupported(filePath) {
    if (!filePath || typeof filePath !== 'string') return false;

    const dotIndex = filePath.lastIndexOf('.');
    if (dotIndex === -1) return false;

    // Slice first, lowercase second: toLowerCase() can change the string's
    // length (e.g. U+0130 becomes two code units), so an index taken from the
    // original string is not valid in the lowercased copy.
    const extension = filePath.slice(dotIndex).toLowerCase();
    return this.getSupportedExtensions().includes(extension);
  }
}