/** * Content Similarity Service * Analyzes semantic similarity between articles with market-aware adjustments */ const ClaudeAPIService = require('./ClaudeAPI.service'); const publicationConfig = require('../config/publication-targets.config'); const logger = require('../utils/logger.util'); class ContentSimilarityService { /** * Analyze semantic similarity between two articles * @param {Object} article1 - First article * @param {Object} article2 - Second article * @param {Object} context - Optional context (target publication, etc.) * @returns {Promise} Similarity analysis with market adjustments */ async analyzeSimilarity(article1, article2, context = {}) { logger.info(`Analyzing similarity: "${article1.title}" vs "${article2.title}"`); try { const claudeAPI = ClaudeAPIService; // Build system prompt for similarity analysis const systemPrompt = `You are an expert content analyst evaluating semantic similarity between two articles. Your task is to analyze: 1. **Core Argument Similarity**: Do they make the same fundamental argument? 2. **Evidence/Examples**: Do they use the same examples or evidence? 3. **Narrative/Framing**: Do they frame the issue the same way? 4. **Conclusions**: Do they reach the same conclusions? 5. **Unique Elements**: What makes each article distinct? Return JSON: { "semanticSimilarity": 0.0-1.0, "argumentSimilarity": 0.0-1.0, "evidenceSimilarity": 0.0-1.0, "framingSimilarity": 0.0-1.0, "conclusionSimilarity": 0.0-1.0, "uniqueElements": { "article1": ["Unique aspect 1", "Unique aspect 2"], "article2": ["Unique aspect 1", "Unique aspect 2"] }, "reasoning": "Detailed explanation of similarity assessment", "verdict": "distinct|similar|very-similar|duplicate" } Similarity scale: - 0.0-0.3: Distinct (different arguments) - 0.3-0.5: Some overlap (related topics, different angles) - 0.5-0.7: Similar (same argument, different execution) - 0.7-0.9: Very similar (same argument and approach) - 0.9-1.0: Duplicate (essentially the same article)`; const userPrompt = `Compare these two articles: **Article 1: "${article1.title}"** Word Count: ${article1.wordCount || 'unknown'} ${article1.content.substring(0, 3000)}${article1.content.length > 3000 ? '...' : ''} --- **Article 2: "${article2.title}"** Word Count: ${article2.wordCount || 'unknown'} ${article2.content.substring(0, 3000)}${article2.content.length > 3000 ? '...' : ''} --- Analyze semantic similarity and provide detailed JSON response.`; const messages = [{ role: 'user', content: userPrompt }]; const response = await claudeAPI.sendMessage(messages, { system: systemPrompt, max_tokens: 1500, temperature: 0.2 // Low temperature for consistent analysis }); const baseSimilarity = claudeAPI.extractJSON(response); // Apply market-aware adjustments const adjusted = this._applyMarketAdjustments( baseSimilarity, article1, article2, context ); logger.info(`Similarity analysis complete: ${adjusted.finalSimilarity.toFixed(2)}`); return adjusted; } catch (error) { logger.error('Similarity analysis error:', error); throw new Error(`Failed to analyze similarity: ${error.message}`); } } /** * Apply market-aware adjustments to similarity score * @private */ _applyMarketAdjustments(baseSimilarity, article1, article2, context) { const adjustments = { differentLanguage: 0, differentRegion: 0, differentCultural: 0, differentAudience: 0, isAdaptation: 0 }; const reasoning = []; // Get publication market data const pub1 = context.targetPublication1 ? publicationConfig.getPublicationById(context.targetPublication1) : null; const pub2 = context.targetPublication2 ? publicationConfig.getPublicationById(context.targetPublication2) : null; // Different language modifier if (pub1?.market?.language && pub2?.market?.language) { if (pub1.market.language !== pub2.market.language) { adjustments.differentLanguage = -0.3; reasoning.push(`Different languages (${pub1.market.language} vs ${pub2.market.language}) - major adjustment`); } } // Different region modifier if (pub1?.market && pub2?.market) { const overlap = this._calculateRegionOverlap(pub1.market.regions, pub2.market.regions); if (overlap === 0) { adjustments.differentRegion = -0.2; reasoning.push(`No regional overlap - content can be reused across markets`); } else if (overlap < 0.3) { adjustments.differentRegion = -0.1; reasoning.push(`Minimal regional overlap (${(overlap * 100).toFixed(0)}%) - slight adjustment`); } else if (overlap > 0.7) { reasoning.push(`High regional overlap (${(overlap * 100).toFixed(0)}%) - similarity matters`); } } // Different cultural context if (pub1?.market?.culturalContext && pub2?.market?.culturalContext) { if (pub1.market.culturalContext !== pub2.market.culturalContext && pub1.market.culturalContext !== 'universal' && pub2.market.culturalContext !== 'universal') { adjustments.differentCultural = -0.15; reasoning.push(`Different cultural contexts (${pub1.market.culturalContext} vs ${pub2.market.culturalContext})`); } } // Different audience sophistication if (pub1?.audience && pub2?.audience) { const audienceOverlap = this._calculateAudienceOverlap(pub1.audience, pub2.audience); if (audienceOverlap < 0.3) { adjustments.differentAudience = -0.1; reasoning.push(`Different target audiences - allows similar messaging with different framing`); } } // Check if intentional adaptation if (context.isAdaptation) { adjustments.isAdaptation = -0.4; reasoning.push(`Marked as intentional adaptation - high similarity expected and allowed`); } // Calculate final similarity const totalAdjustment = Object.values(adjustments).reduce((sum, val) => sum + val, 0); const finalSimilarity = Math.max(0, Math.min(1, baseSimilarity.semanticSimilarity + totalAdjustment)); // Determine if allowed const threshold = context.similarityThreshold || 0.70; const allowed = finalSimilarity < threshold; return { baseSimilarity: baseSimilarity.semanticSimilarity, adjustments, totalAdjustment, finalSimilarity, threshold, allowed, verdict: baseSimilarity.verdict, reasoning: reasoning.join('; '), detailedAnalysis: baseSimilarity, recommendation: this._generateRecommendation(finalSimilarity, allowed, context) }; } /** * Calculate regional overlap between two publication markets * @private */ _calculateRegionOverlap(regions1, regions2) { if (!regions1 || !regions2) return 0.5; // Unknown, assume moderate overlap // Handle global reach specially if (regions1.includes('global') || regions2.includes('global')) { return 1.0; // Global publications overlap with everything } const set1 = new Set(regions1); const set2 = new Set(regions2); const intersection = new Set([...set1].filter(x => set2.has(x))); if (set1.size === 0 || set2.size === 0) return 0; return intersection.size / Math.max(set1.size, set2.size); } /** * Calculate audience overlap * @private */ _calculateAudienceOverlap(audience1, audience2) { if (!audience1 || !audience2) return 0.5; const set1 = new Set(audience1); const set2 = new Set(audience2); const intersection = new Set([...set1].filter(x => set2.has(x))); if (set1.size === 0 || set2.size === 0) return 0; return intersection.size / Math.max(set1.size, set2.size); } /** * Generate human-readable recommendation * @private */ _generateRecommendation(similarity, allowed, context) { if (similarity < 0.3) { return '✅ These articles are distinct. Safe to publish both.'; } else if (similarity < 0.5) { return '✅ Some thematic overlap, but different enough for both markets.'; } else if (similarity < 0.7) { return allowed ? '⚠️ Moderately similar. Acceptable for different markets, but consider revisions for same market.' : '⚠️ Moderately similar. Consider significant revisions to differentiate.'; } else if (similarity < 0.85) { return allowed ? '⚠️ Very similar. Only acceptable due to different markets/languages. Ensure cultural adaptation.' : '🚫 Very similar. Revisions required before submitting to overlapping markets.'; } else { return allowed ? '⚠️ Nearly identical. Only acceptable as intentional translation/adaptation.' : '🚫 Too similar. This would violate editorial standards and exclusivity requirements.'; } } /** * Check for submission conflicts * @param {string} contentId - BlogPost ID * @param {string} targetPublicationId - Target publication * @param {Object} SubmissionTracking - Submission model * @returns {Promise} Conflict analysis */ async checkSubmissionConflict(contentId, targetPublicationId, SubmissionTracking) { const targetPub = publicationConfig.getPublicationById(targetPublicationId); if (!targetPub) { throw new Error(`Publication not found: ${targetPublicationId}`); } // Find active submissions for this content const activeSubmissions = await SubmissionTracking.find({ blogPostId: contentId, status: { $in: ['submitted', 'under_review', 'revision_requested', 'revised'] } }); const conflicts = []; for (const submission of activeSubmissions) { const submittedPub = publicationConfig.getPublicationById(submission.publicationId); if (!submittedPub) continue; // Check for market overlap const hasOverlap = this._checkMarketConflict(targetPub, submittedPub); if (hasOverlap) { conflicts.push({ publication: submittedPub.name, status: submission.status, submittedAt: submission.submittedAt, reason: 'Overlapping market/readership', severity: 'high' }); } } return { hasConflict: conflicts.length > 0, conflicts, allowed: conflicts.length === 0, message: conflicts.length > 0 ? `Cannot submit: Active submission to ${conflicts[0].publication} (${conflicts[0].status})` : 'No conflicts - safe to submit' }; } /** * Check if two publications have conflicting markets * @private */ _checkMarketConflict(pub1, pub2) { // Same publication = definite conflict if (pub1.id === pub2.id) return true; // Check readership overlap list if (pub1.market?.readershipOverlap?.includes(pub2.id)) return true; if (pub2.market?.readershipOverlap?.includes(pub1.id)) return true; // Global publications conflict with everything if (pub1.market?.reach === 'global' || pub2.market?.reach === 'global') { return true; } // Check regional overlap if (pub1.market?.regions && pub2.market?.regions) { const overlap = this._calculateRegionOverlap(pub1.market.regions, pub2.market.regions); return overlap > 0.5; // More than 50% overlap = conflict } // Unknown - assume conflict for safety return true; } /** * Check title similarity between two articles * @param {string} title1 - First title * @param {string} title2 - Second title * @returns {Object} Title similarity result */ analyzeTitleSimilarity(title1, title2) { const normalize = str => str.toLowerCase().replace(/[^a-z0-9\s]/g, '').trim(); const t1 = normalize(title1); const t2 = normalize(title2); // Exact match if (t1 === t2) { return { similarity: 1.0, pass: false, message: '🚫 Identical titles - one must be changed' }; } // Calculate word overlap const words1 = new Set(t1.split(/\s+/)); const words2 = new Set(t2.split(/\s+/)); // Remove common stop words const stopWords = new Set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']); const meaningfulWords1 = new Set([...words1].filter(w => !stopWords.has(w))); const meaningfulWords2 = new Set([...words2].filter(w => !stopWords.has(w))); const intersection = new Set([...meaningfulWords1].filter(x => meaningfulWords2.has(x))); const union = new Set([...meaningfulWords1, ...meaningfulWords2]); const jaccardSimilarity = union.size > 0 ? intersection.size / union.size : 0; // Check for substring containment const containment = t1.includes(t2) || t2.includes(t1) ? 0.3 : 0; const finalSimilarity = Math.max(jaccardSimilarity, containment); // Threshold: 60% similarity is too high for high-profile publications const threshold = 0.6; const pass = finalSimilarity < threshold; let message; if (finalSimilarity >= 0.9) { message = '🚫 Nearly identical titles - change required'; } else if (finalSimilarity >= threshold) { message = '⚠️ Titles too similar for same-market publications - change one'; } else if (finalSimilarity >= 0.4) { message = '⚠️ Some title overlap - acceptable but consider variation'; } else { message = '✅ Titles sufficiently distinct'; } return { similarity: Math.round(finalSimilarity * 100) / 100, pass, threshold, sharedWords: [...intersection], message }; } } // Singleton instance let instance = null; module.exports = { getInstance: () => { if (!instance) { instance = new ContentSimilarityService(); } return instance; }, ContentSimilarityService };