Source: zpt/transform/TokenCounter.js

/**
 * Token counting for different tokenizers and LLM models. Counts come from
 * tokenizer-aware estimation heuristics; exact tokenization (e.g. via tiktoken)
 * can be plugged in where precision matters.
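 *
 * @example
 * // Minimal usage sketch (counts are heuristic estimates, not exact):
 * const counter = new TokenCounter();
 * const result = await counter.countTokens('Hello, world!');
 * console.log(result.count, result.method); // e.g. 5 'advanced_estimation'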
 */
export default class TokenCounter {
    constructor(options = {}) {
        this.config = {
            defaultTokenizer: options.defaultTokenizer || 'cl100k_base',
            cacheEnabled: options.cacheEnabled !== false,
            maxCacheSize: options.maxCacheSize || 1000,
            estimationFallback: options.estimationFallback !== false,
            ...options
        };

        this.initializeTokenizers();
        this.initializeModelMappings();
        this.tokenCache = new Map();
        this.estimationCache = new Map();
    }

    /**
     * Initialize available tokenizers and their configurations
     */
    initializeTokenizers() {
        this.tokenizers = {
            'cl100k_base': {
                name: 'cl100k_base',
                models: ['gpt-4', 'gpt-3.5-turbo', 'text-embedding-ada-002'],
                avgCharsPerToken: 4.0,
                specialTokens: new Set(['<|endoftext|>', '<|fim_prefix|>', '<|fim_middle|>', '<|fim_suffix|>']),
                encoding: 'cl100k_base'
            },
            'p50k_base': {
                name: 'p50k_base', 
                models: ['text-davinci-003', 'text-davinci-002', 'code-davinci-002'],
                avgCharsPerToken: 4.0,
                specialTokens: new Set(['<|endoftext|>']),
                encoding: 'p50k_base'
            },
            'r50k_base': {
                name: 'r50k_base',
                models: ['text-davinci-001', 'davinci', 'curie', 'babbage', 'ada'],
                avgCharsPerToken: 4.0,
                specialTokens: new Set(['<|endoftext|>']),
                encoding: 'r50k_base'
            },
            'gpt2': {
                name: 'gpt2',
                models: ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'],
                avgCharsPerToken: 4.0,
                specialTokens: new Set(['<|endoftext|>']),
                encoding: 'gpt2'
            },
            'claude': {
                name: 'claude',
                models: ['claude-3', 'claude-2', 'claude-instant'],
                avgCharsPerToken: 3.8,
                specialTokens: new Set(['\n\nHuman:', '\n\nAssistant:']),
                encoding: 'claude'
            },
            'llama': {
                name: 'llama',
                models: ['llama-2', 'llama-7b', 'llama-13b', 'llama-70b'],
                avgCharsPerToken: 4.2,
                specialTokens: new Set(['<s>', '</s>', '<unk>']),
                encoding: 'llama'
            }
        };
    }

    /**
     * Initialize model-specific token counting configurations
     */
    initializeModelMappings() {
        this.modelConfigs = {
            'gpt-4': {
                tokenizer: 'cl100k_base',
                maxContextLength: 8192,
                maxOutputTokens: 4096,
                costPerInputToken: 0.00003,
                costPerOutputToken: 0.00006
            },
            'gpt-3.5-turbo': {
                tokenizer: 'cl100k_base',
                maxContextLength: 16385,
                maxOutputTokens: 4096,
                costPerInputToken: 0.0000015,
                costPerOutputToken: 0.000002
            },
            'claude-3-sonnet': {
                tokenizer: 'claude',
                maxContextLength: 200000,
                maxOutputTokens: 4096,
                costPerInputToken: 0.000003,
                costPerOutputToken: 0.000015
            },
            'claude-3-opus': {
                tokenizer: 'claude',
                maxContextLength: 200000,
                maxOutputTokens: 4096,
                costPerInputToken: 0.000015,
                costPerOutputToken: 0.000075
            },
            'claude-3-haiku': {
                tokenizer: 'claude',
                maxContextLength: 200000,
                maxOutputTokens: 4096,
                costPerInputToken: 0.00000025,
                costPerOutputToken: 0.00000125
            }
        };
    }

    /**
     * Count tokens in text using specified tokenizer
     * @param {string} text - Text to count tokens for
     * @param {string} tokenizer - Tokenizer name (optional)
     * @returns {Promise<Object>} Token count result
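     * @example
     * // Sketch, assuming `counter` is a TokenCounter instance; the count is a
     * // heuristic estimate rather than exact tokenizer output:
     * const { count, confidence } = await counter.countTokens('Some prose.', 'cl100k_base');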
     */
    async countTokens(text, tokenizer = null) {
        if (!text || typeof text !== 'string') {
            return { count: 0, method: 'empty', tokenizer: tokenizer || this.config.defaultTokenizer };
        }

        const tokenizerName = tokenizer || this.config.defaultTokenizer;
        const cacheKey = this.createCacheKey(text, tokenizerName);

        // Check cache first
        if (this.config.cacheEnabled && this.tokenCache.has(cacheKey)) {
            const cached = this.tokenCache.get(cacheKey);
            return { ...cached, fromCache: true };
        }

        let result;
        try {
            // Try precise tokenization first
            result = await this.preciseTokenCount(text, tokenizerName);
        } catch (error) {
            // Fallback to estimation if precise counting fails
            if (this.config.estimationFallback) {
                result = this.estimateTokenCount(text, tokenizerName);
                result.fallback = true;
                result.error = error.message;
            } else {
                throw error;
            }
        }

        // Cache result
        if (this.config.cacheEnabled) {
            this.cacheTokenCount(cacheKey, result);
        }

        return result;
    }

    /**
     * Precise token counting using actual tokenizer
     * @param {string} text - Text to tokenize
     * @param {string} tokenizerName - Tokenizer to use
     * @returns {Promise<Object>} Precise token count
     */
    async preciseTokenCount(text, tokenizerName) {
        const tokenizer = this.tokenizers[tokenizerName];
        if (!tokenizer) {
            throw new Error(`Unknown tokenizer: ${tokenizerName}`);
        }

        // Precise tokenization requires an external library (e.g. tiktoken);
        // until one is integrated, fall back to the tokenizer-aware estimator
        return this.advancedEstimation(text, tokenizerName);
    }

    /**
     * Advanced token estimation with tokenizer-specific logic
     * @param {string} text - Text to estimate
     * @param {string} tokenizerName - Tokenizer name
     * @returns {Object} Estimated token count
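     * @example
     * // Illustrative walk-through: 'Hello, world!' (13 chars) with cl100k_base
     * // starts at ceil(13 / 4.0) = 4 base tokens, then the tokenizer and
     * // punctuation adjustments below scale the final rounded count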
     */
    advancedEstimation(text, tokenizerName) {
        const tokenizer = this.tokenizers[tokenizerName];
        const startTime = Date.now();

        // Basic character-based estimation
        let estimatedTokens = Math.ceil(text.length / tokenizer.avgCharsPerToken);

        // Apply tokenizer-specific adjustments
        estimatedTokens = this.applyTokenizerAdjustments(text, estimatedTokens, tokenizer);

        // Apply content-type specific adjustments
        estimatedTokens = this.applyContentAdjustments(text, estimatedTokens);

        return {
            count: Math.max(1, Math.round(estimatedTokens)),
            method: 'advanced_estimation',
            tokenizer: tokenizerName,
            confidence: this.calculateConfidence(text, tokenizer),
            processingTime: Date.now() - startTime,
            details: {
                originalLength: text.length,
                avgCharsPerToken: tokenizer.avgCharsPerToken,
                adjustmentFactor: estimatedTokens / (text.length / tokenizer.avgCharsPerToken)
            }
        };
    }

    /**
     * Simple token estimation fallback
     * @param {string} text - Text to estimate
     * @param {string} tokenizerName - Tokenizer name
     * @returns {Object} Simple token estimate
     */
    estimateTokenCount(text, tokenizerName) {
        const tokenizer = this.tokenizers[tokenizerName] || this.tokenizers[this.config.defaultTokenizer];
        const estimatedTokens = Math.ceil(text.length / tokenizer.avgCharsPerToken);

        return {
            count: Math.max(1, estimatedTokens),
            method: 'simple_estimation',
            tokenizer: tokenizerName,
            confidence: 0.7,
            processingTime: 0
        };
    }

    /**
     * Apply tokenizer-specific adjustments
     */
    applyTokenizerAdjustments(text, baseEstimate, tokenizer) {
        let adjusted = baseEstimate;

        // Count special-token occurrences; escape regex metacharacters so tokens
        // like '<|endoftext|>' match literally ('|' would otherwise act as alternation)
        tokenizer.specialTokens.forEach(token => {
            const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
            const occurrences = (text.match(new RegExp(escaped, 'g')) || []).length;
            adjusted += occurrences; // Special tokens usually count as 1 token regardless of length
        });

        // Tokenizer-specific patterns
        switch (tokenizer.name) {
            case 'cl100k_base':
                // GPT-4 tokenizer tends to split on punctuation more aggressively
                adjusted *= 1.1;
                break;
            case 'claude':
                // Claude tokenizer is slightly more efficient
                adjusted *= 0.95;
                break;
            case 'llama':
                // LLaMA tokenizer has different behavior for certain patterns
                adjusted *= 1.05;
                break;
        }

        return adjusted;
    }

    /**
     * Apply content-type specific adjustments
     */
    applyContentAdjustments(text, baseEstimate) {
        let adjusted = baseEstimate;

        // Code detection
        if (this.isCode(text)) {
            adjusted *= 1.3; // Code typically has more tokens per character
        }

        // URL detection
        const urlCount = (text.match(/https?:\/\/[^\s]+/g) || []).length;
        adjusted += urlCount * 2; // URLs often split into multiple tokens

        // Number detection
        const numberCount = (text.match(/\d+/g) || []).length;
        adjusted += numberCount * 0.5; // Numbers may split differently

        // Punctuation density
        const punctuationDensity = (text.match(/[.,;:!?()[\]{}]/g) || []).length / text.length;
        if (punctuationDensity > 0.1) {
            adjusted *= (1 + punctuationDensity); // High punctuation increases token count
        }

        // Non-ASCII characters
        const nonAsciiCount = (text.match(/[^\x00-\x7F]/g) || []).length;
        if (nonAsciiCount > 0) {
            adjusted += nonAsciiCount * 0.5; // Non-ASCII often requires more tokens
        }

        return adjusted;
    }

    /**
     * Calculate confidence in estimation
     */
    calculateConfidence(text, tokenizer) {
        let confidence = 0.8; // Base confidence

        // Reduce confidence for complex content
        if (this.isCode(text)) confidence -= 0.1;
        if (text.includes('http')) confidence -= 0.05;
        if (text.match(/[^\x00-\x7F]/)) confidence -= 0.1;

        // Increase confidence for simple text
        if (text.match(/^[a-zA-Z\s.,!?]+$/)) confidence += 0.1;

        return Math.max(0.3, Math.min(0.95, confidence));
    }

    /**
     * Detect if text is likely code
     */
    isCode(text) {
        const codeIndicators = [
            /function\s+\w+\s*\(/,
            /class\s+\w+/,
            /import\s+\w+/,
            /def\s+\w+\s*\(/,
            /\{[\s\S]*\}/,
            /console\.log/,
            /print\s*\(/
        ];

        return codeIndicators.some(pattern => pattern.test(text));
    }

    /**
     * Count tokens for multiple texts in batch
     * @param {Array<string>} texts - Array of texts to count
     * @param {string} tokenizer - Tokenizer name
     * @returns {Promise<Array<Object>>} Array of token counts
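     * @example
     * // Sketch: failures for individual texts are reported inline, not thrown
     * const results = await counter.batchCountTokens(['first text', 'second text']);
     * const totalTokens = results.reduce((sum, r) => sum + r.count, 0);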
     */
    async batchCountTokens(texts, tokenizer = null) {
        const results = [];
        
        for (const text of texts) {
            try {
                const result = await this.countTokens(text, tokenizer);
                results.push(result);
            } catch (error) {
                results.push({
                    count: 0,
                    method: 'error',
                    error: error.message,
                    tokenizer: tokenizer || this.config.defaultTokenizer
                });
            }
        }

        return results;
    }

    /**
     * Estimate cost for token usage
     * @param {number} inputTokens - Number of input tokens
     * @param {number} outputTokens - Number of output tokens  
     * @param {string} model - Model name
     * @returns {Object} Cost estimation
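     * @example
     * // Worked example with the configured gpt-3.5-turbo rates:
     * // 1000 * 0.0000015 + 500 * 0.000002 = 0.0025 USD
     * const cost = counter.estimateCost(1000, 500, 'gpt-3.5-turbo');
     * // cost.totalCost ~ 0.0025 (subject to floating-point rounding)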
     */
    estimateCost(inputTokens, outputTokens, model) {
        const modelConfig = this.modelConfigs[model];
        if (!modelConfig) {
            return {
                error: `Unknown model: ${model}`,
                inputCost: 0,
                outputCost: 0,
                totalCost: 0
            };
        }

        const inputCost = inputTokens * modelConfig.costPerInputToken;
        const outputCost = outputTokens * modelConfig.costPerOutputToken;
        const totalCost = inputCost + outputCost;

        return {
            model,
            inputTokens,
            outputTokens,
            inputCost,
            outputCost,
            totalCost,
            currency: 'USD'
        };
    }

    /**
     * Check if content fits within model context limits
     * @param {number} tokenCount - Token count to check
     * @param {string} model - Model name
     * @param {number} reservedOutputTokens - Tokens to reserve for output
     * @returns {Object} Context limit check result
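     * @example
     * // With gpt-3.5-turbo (16385-token context) and the default 1000-token reserve:
     * const check = counter.checkContextLimits(12000, 'gpt-3.5-turbo');
     * // check.fits === true; check.utilization ~ 12000 / 15385 ~ 0.78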
     */
    checkContextLimits(tokenCount, model, reservedOutputTokens = 1000) {
        const modelConfig = this.modelConfigs[model];
        if (!modelConfig) {
            return {
                error: `Unknown model: ${model}`,
                fits: false
            };
        }

        const availableTokens = modelConfig.maxContextLength - reservedOutputTokens;
        const fits = tokenCount <= availableTokens;
        const utilization = tokenCount / availableTokens;

        return {
            model,
            tokenCount,
            maxContextLength: modelConfig.maxContextLength,
            availableTokens,
            reservedOutputTokens,
            fits,
            utilization,
            recommendation: this.getContextRecommendation(utilization)
        };
    }

    /**
     * Get recommendation based on context utilization
     */
    getContextRecommendation(utilization) {
        if (utilization > 1.0) return 'Content exceeds context limit - chunking required';
        if (utilization > 0.9) return 'Very high utilization - consider chunking';
        if (utilization > 0.7) return 'High utilization - monitor performance';
        if (utilization > 0.5) return 'Moderate utilization - acceptable';
        return 'Low utilization - efficient usage';
    }

    /**
     * Optimize token usage for a given budget
     * @param {Array<Object>} content - Content items with token counts
     * @param {number} tokenBudget - Available token budget
     * @param {string} strategy - Optimization strategy
     * @returns {Object} Optimization result
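     * @example
     * // Greedy selection sketch; items are assumed to carry a precomputed tokenCount:
     * const plan = counter.optimizeTokenUsage([
     *     { id: 'a', tokenCount: 600, priority: 2 },
     *     { id: 'b', tokenCount: 500, priority: 1 }
     * ], 1000, 'priority');
     * // plan.selected contains only 'a' (600 + 500 would exceed the 1000-token budget)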
     */
    optimizeTokenUsage(content, tokenBudget, strategy = 'priority') {
        const sorted = [...content];
        
        switch (strategy) {
            case 'priority':
                sorted.sort((a, b) => (b.priority || 0) - (a.priority || 0));
                break;
            case 'efficiency':
                // Guard against missing scores and zero token counts
                sorted.sort((a, b) =>
                    ((b.score || 0) / (b.tokenCount || 1)) - ((a.score || 0) / (a.tokenCount || 1)));
                break;
            case 'size':
                sorted.sort((a, b) => a.tokenCount - b.tokenCount);
                break;
        }

        const selected = [];
        let usedTokens = 0;
        
        for (const item of sorted) {
            if (usedTokens + item.tokenCount <= tokenBudget) {
                selected.push(item);
                usedTokens += item.tokenCount;
            }
        }

        return {
            selected,
            usedTokens,
            remainingTokens: tokenBudget - usedTokens,
            utilization: usedTokens / tokenBudget,
            strategy,
            droppedItems: content.length - selected.length
        };
    }

    /**
     * Cache management methods
     */
    createCacheKey(text, tokenizer) {
        // Create hash of text and tokenizer for cache key
        const hash = this.simpleHash(text + tokenizer);
        return `${tokenizer}_${hash}`;
    }

    cacheTokenCount(cacheKey, result) {
        if (this.tokenCache.size >= this.config.maxCacheSize) {
            // Remove oldest entry
            const firstKey = this.tokenCache.keys().next().value;
            this.tokenCache.delete(firstKey);
        }
        
        this.tokenCache.set(cacheKey, {
            ...result,
            cached: true,
            cacheTime: Date.now()
        });
    }

    simpleHash(str) {
        let hash = 0;
        for (let i = 0; i < str.length; i++) {
            const char = str.charCodeAt(i);
            hash = ((hash << 5) - hash) + char;
            hash = hash & hash;
        }
        return Math.abs(hash).toString(36);
    }

    /**
     * Get tokenizer information
     * @param {string} tokenizerName - Tokenizer name
     * @returns {Object} Tokenizer information
     */
    getTokenizerInfo(tokenizerName) {
        const tokenizer = this.tokenizers[tokenizerName];
        if (!tokenizer) {
            throw new Error(`Unknown tokenizer: ${tokenizerName}`);
        }
        
        return { ...tokenizer };
    }

    /**
     * Get model information
     * @param {string} modelName - Model name
     * @returns {Object} Model information
     */
    getModelInfo(modelName) {
        const model = this.modelConfigs[modelName];
        if (!model) {
            throw new Error(`Unknown model: ${modelName}`);
        }
        
        return { ...model };
    }

    /**
     * List available tokenizers
     * @returns {Array<string>} Available tokenizer names
     */
    getAvailableTokenizers() {
        return Object.keys(this.tokenizers);
    }

    /**
     * List available models
     * @returns {Array<string>} Available model names
     */
    getAvailableModels() {
        return Object.keys(this.modelConfigs);
    }

    /**
     * Clear token cache
     */
    clearCache() {
        this.tokenCache.clear();
        this.estimationCache.clear();
    }

    /**
     * Get cache statistics
     * @returns {Object} Cache statistics
     */
    getCacheStats() {
        return {
            tokenCacheSize: this.tokenCache.size,
            estimationCacheSize: this.estimationCache.size,
            maxCacheSize: this.config.maxCacheSize,
            cacheEnabled: this.config.cacheEnabled
        };
    }

    /**
     * Get performance statistics
     * @returns {Object} Performance statistics
     */
    getPerformanceStats() {
        // Placeholder metrics; real call, hit-rate, and latency tracking is not implemented yet
        return {
            totalCalls: this.tokenCache.size, // Approximate
            cacheHitRate: 0.8, // Placeholder
            avgProcessingTime: 5 // ms, placeholder
        };
    }
}