tractatus/src/services/ContentSimilarity.service.js
TheFlow 7f6192cbd6 refactor(lint): fix code style and unused variables across src/
- Fixed unused function parameters by prefixing with underscore
- Removed unused imports and variables
- Applied eslint --fix for automatic style fixes
  - Property shorthand
  - String template literals
  - Prefer const over let where appropriate
  - Spacing and formatting

Reduces lint errors from 108+ to 78 (61 unused vars, 17 other issues)

Related to CI lint failures in previous commit

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-24 20:15:26 +13:00

410 lines
14 KiB
JavaScript

/**
* Content Similarity Service
* Analyzes semantic similarity between articles with market-aware adjustments
*/
const ClaudeAPIService = require('./ClaudeAPI.service');
const publicationConfig = require('../config/publication-targets.config');
const logger = require('../utils/logger.util');
/**
 * Compares articles and titles to decide whether two pieces of content are
 * too alike to be submitted to the given publications.
 *
 * The semantic score comes from the Claude API; it is then lowered by
 * "market" adjustments (language, region, cultural context, audience,
 * intentional adaptation) looked up from the publication config.
 */
class ContentSimilarityService {
  /**
   * Analyze semantic similarity between two articles
   * @param {Object} article1 - First article (reads `title`, `content`, `wordCount`)
   * @param {Object} article2 - Second article (reads `title`, `content`, `wordCount`)
   * @param {Object} context - Optional context (target publication, etc.)
   * @returns {Promise<Object>} Similarity analysis with market adjustments
   * @throws {Error} Wraps any failure (invalid input, API error, unparsable
   *   response) as "Failed to analyze similarity: ..." with the original
   *   error attached as `cause`.
   */
  async analyzeSimilarity(article1, article2, context = {}) {
    // Only the leading part of each article is sent to the model.
    const MAX_EXCERPT_CHARS = 3000;
    const excerpt = content =>
      `${content.substring(0, MAX_EXCERPT_CHARS)}${content.length > MAX_EXCERPT_CHARS ? '...' : ''}`;

    try {
      // Fail with a readable message instead of "Cannot read properties of undefined".
      const labelledArticles = [['article1', article1], ['article2', article2]];
      for (const [label, article] of labelledArticles) {
        if (typeof article?.content !== 'string') {
          throw new TypeError(`${label}.content must be a string`);
        }
      }

      logger.info(`Analyzing similarity: "${article1.title}" vs "${article2.title}"`);

      const claudeAPI = ClaudeAPIService;

      // Build system prompt for similarity analysis
      const systemPrompt = `You are an expert content analyst evaluating semantic similarity between two articles.
Your task is to analyze:
1. **Core Argument Similarity**: Do they make the same fundamental argument?
2. **Evidence/Examples**: Do they use the same examples or evidence?
3. **Narrative/Framing**: Do they frame the issue the same way?
4. **Conclusions**: Do they reach the same conclusions?
5. **Unique Elements**: What makes each article distinct?
Return JSON:
{
"semanticSimilarity": 0.0-1.0,
"argumentSimilarity": 0.0-1.0,
"evidenceSimilarity": 0.0-1.0,
"framingSimilarity": 0.0-1.0,
"conclusionSimilarity": 0.0-1.0,
"uniqueElements": {
"article1": ["Unique aspect 1", "Unique aspect 2"],
"article2": ["Unique aspect 1", "Unique aspect 2"]
},
"reasoning": "Detailed explanation of similarity assessment",
"verdict": "distinct|similar|very-similar|duplicate"
}
Similarity scale:
- 0.0-0.3: Distinct (different arguments)
- 0.3-0.5: Some overlap (related topics, different angles)
- 0.5-0.7: Similar (same argument, different execution)
- 0.7-0.9: Very similar (same argument and approach)
- 0.9-1.0: Duplicate (essentially the same article)`;

      const userPrompt = `Compare these two articles:
**Article 1: "${article1.title}"**
Word Count: ${article1.wordCount || 'unknown'}
${excerpt(article1.content)}
---
**Article 2: "${article2.title}"**
Word Count: ${article2.wordCount || 'unknown'}
${excerpt(article2.content)}
---
Analyze semantic similarity and provide detailed JSON response.`;

      const messages = [{ role: 'user', content: userPrompt }];
      const response = await claudeAPI.sendMessage(messages, {
        system: systemPrompt,
        max_tokens: 1500,
        temperature: 0.2 // Low temperature for consistent analysis
      });
      const baseSimilarity = claudeAPI.extractJSON(response);

      // Apply market-aware adjustments
      const adjusted = this._applyMarketAdjustments(
        baseSimilarity,
        article1,
        article2,
        context
      );

      logger.info(`Similarity analysis complete: ${adjusted.finalSimilarity.toFixed(2)}`);
      return adjusted;
    } catch (error) {
      logger.error('Similarity analysis error:', error);
      // Keep the original error reachable for callers that want the stack.
      throw new Error(`Failed to analyze similarity: ${error.message}`, { cause: error });
    }
  }

  /**
   * Apply market-aware adjustments to similarity score
   *
   * Each applicable market difference contributes a negative adjustment; the
   * sum is added to the model's `semanticSimilarity` and clamped to [0, 1].
   * The pair is `allowed` when the adjusted score is strictly below the
   * threshold (`context.similarityThreshold`, default 0.70).
   *
   * @param {Object} baseSimilarity - Parsed model response; must carry a numeric `semanticSimilarity`
   * @param {Object} article1 - Unused here; kept for interface stability
   * @param {Object} article2 - Unused here; kept for interface stability
   * @param {Object} [context] - Reads `targetPublication1`, `targetPublication2`,
   *   `isAdaptation`, `similarityThreshold`
   * @returns {Object} Adjusted similarity report
   * @throws {TypeError} If `semanticSimilarity` is missing or not a finite number
   * @private
   */
  _applyMarketAdjustments(baseSimilarity, article1, article2, context = {}) {
    const ctx = context ?? {};
    const baseScore = this._parseSimilarityScore(baseSimilarity?.semanticSimilarity);

    const adjustments = {
      differentLanguage: 0,
      differentRegion: 0,
      differentCultural: 0,
      differentAudience: 0,
      isAdaptation: 0
    };
    const reasoning = [];

    // Get publication market data
    const pub1 = ctx.targetPublication1
      ? publicationConfig.getPublicationById(ctx.targetPublication1)
      : null;
    const pub2 = ctx.targetPublication2
      ? publicationConfig.getPublicationById(ctx.targetPublication2)
      : null;
    const market1 = pub1?.market;
    const market2 = pub2?.market;

    // Different language modifier
    if (market1?.language && market2?.language && market1.language !== market2.language) {
      adjustments.differentLanguage = -0.3;
      reasoning.push(`Different languages (${market1.language} vs ${market2.language}) - major adjustment`);
    }

    // Different region modifier
    if (market1 && market2) {
      const overlap = this._calculateRegionOverlap(market1.regions, market2.regions);
      if (overlap === 0) {
        adjustments.differentRegion = -0.2;
        reasoning.push('No regional overlap - content can be reused across markets');
      } else if (overlap < 0.3) {
        adjustments.differentRegion = -0.1;
        reasoning.push(`Minimal regional overlap (${(overlap * 100).toFixed(0)}%) - slight adjustment`);
      } else if (overlap > 0.7) {
        // Informational only: no adjustment is applied for high overlap.
        reasoning.push(`High regional overlap (${(overlap * 100).toFixed(0)}%) - similarity matters`);
      }
    }

    // Different cultural context ('universal' on either side never counts as different)
    const culture1 = market1?.culturalContext;
    const culture2 = market2?.culturalContext;
    if (culture1 && culture2 &&
        culture1 !== culture2 &&
        culture1 !== 'universal' &&
        culture2 !== 'universal') {
      adjustments.differentCultural = -0.15;
      reasoning.push(`Different cultural contexts (${culture1} vs ${culture2})`);
    }

    // Different audience sophistication
    if (pub1?.audience && pub2?.audience) {
      const audienceOverlap = this._calculateAudienceOverlap(pub1.audience, pub2.audience);
      if (audienceOverlap < 0.3) {
        adjustments.differentAudience = -0.1;
        reasoning.push('Different target audiences - allows similar messaging with different framing');
      }
    }

    // Check if intentional adaptation
    if (ctx.isAdaptation) {
      adjustments.isAdaptation = -0.4;
      reasoning.push('Marked as intentional adaptation - high similarity expected and allowed');
    }

    // Calculate final similarity. Scores are rounded so binary float drift
    // (e.g. 0.6 - 0.2 === 0.39999999999999997) cannot flip the threshold test.
    const totalAdjustment = this._roundScore(
      Object.values(adjustments).reduce((sum, val) => sum + val, 0)
    );
    const finalSimilarity = this._roundScore(
      Math.max(0, Math.min(1, baseScore + totalAdjustment))
    );

    // Determine if allowed. `??` (not `||`) so an explicit threshold of 0 is honored.
    const threshold = ctx.similarityThreshold ?? 0.70;
    const allowed = finalSimilarity < threshold;

    return {
      baseSimilarity: baseScore,
      adjustments,
      totalAdjustment,
      finalSimilarity,
      threshold,
      allowed,
      verdict: baseSimilarity.verdict,
      reasoning: reasoning.join('; '),
      detailedAnalysis: baseSimilarity,
      recommendation: this._generateRecommendation(finalSimilarity, allowed, ctx)
    };
  }

  /**
   * Coerce a model-supplied similarity score to a finite number.
   * Accepts numbers and non-empty numeric strings; anything else is rejected
   * so a malformed response cannot turn into a NaN score downstream.
   * @param {*} value - Raw `semanticSimilarity` value
   * @returns {number} Finite score
   * @throws {TypeError} If the value is not a finite number
   * @private
   */
  _parseSimilarityScore(value) {
    const score = typeof value === 'string' && value.trim() !== ''
      ? Number(value)
      : value;
    if (typeof score !== 'number' || !Number.isFinite(score)) {
      throw new TypeError(`Invalid semanticSimilarity score: ${String(value)}`);
    }
    return score;
  }

  /**
   * Round a score to 6 decimal places to remove floating-point noise.
   * @param {number} value - Score to round
   * @returns {number} Rounded score
   * @private
   */
  _roundScore(value) {
    return Math.round(value * 1e6) / 1e6;
  }

  /**
   * Calculate regional overlap between two publication markets
   * @param {Iterable<string>} regions1 - Regions of the first market
   * @param {Iterable<string>} regions2 - Regions of the second market
   * @returns {number} 0.5 when either side is unknown, 1.0 when either side
   *   includes 'global', otherwise shared regions / size of the larger set
   * @private
   */
  _calculateRegionOverlap(regions1, regions2) {
    if (!regions1 || !regions2) return 0.5; // Unknown, assume moderate overlap
    // Handle global reach specially
    if (regions1.includes('global') || regions2.includes('global')) {
      return 1.0; // Global publications overlap with everything
    }
    return this._calculateSetOverlap(regions1, regions2);
  }

  /**
   * Calculate audience overlap
   * @param {Iterable<string>} audience1 - Audience tags of the first publication
   * @param {Iterable<string>} audience2 - Audience tags of the second publication
   * @returns {number} 0.5 when either side is unknown, otherwise shared
   *   entries / size of the larger set
   * @private
   */
  _calculateAudienceOverlap(audience1, audience2) {
    return this._calculateSetOverlap(audience1, audience2);
  }

  /**
   * Shared overlap measure for region and audience lists.
   *
   * The denominator is the LARGER set, so a small list fully contained in a
   * big one still scores low (e.g. 1 of 4 -> 0.25).
   * NOTE(review): inputs are assumed to be arrays of strings; a plain string
   * would be split into characters by `new Set` - confirm against the
   * publication config.
   *
   * @param {Iterable<*>} items1 - First collection
   * @param {Iterable<*>} items2 - Second collection
   * @returns {number} Overlap in [0, 1]; 0.5 if either input is missing,
   *   0 if either is empty
   * @private
   */
  _calculateSetOverlap(items1, items2) {
    if (!items1 || !items2) return 0.5;
    const set1 = new Set(items1);
    const set2 = new Set(items2);
    if (set1.size === 0 || set2.size === 0) return 0;
    let shared = 0;
    for (const item of set1) {
      if (set2.has(item)) shared += 1;
    }
    return shared / Math.max(set1.size, set2.size);
  }

  /**
   * Generate human-readable recommendation
   * @param {number} similarity - Final (adjusted) similarity score
   * @param {boolean} allowed - Whether the score is below the threshold
   * @param {Object} _context - Currently unused
   * @returns {string} Recommendation text
   * @private
   */
  _generateRecommendation(similarity, allowed, _context) {
    if (similarity < 0.3) {
      return '✅ These articles are distinct. Safe to publish both.';
    }
    if (similarity < 0.5) {
      return '✅ Some thematic overlap, but different enough for both markets.';
    }
    if (similarity < 0.7) {
      return allowed
        ? '⚠️ Moderately similar. Acceptable for different markets, but consider revisions for same market.'
        : '⚠️ Moderately similar. Consider significant revisions to differentiate.';
    }
    if (similarity < 0.85) {
      return allowed
        ? '⚠️ Very similar. Only acceptable due to different markets/languages. Ensure cultural adaptation.'
        : '🚫 Very similar. Revisions required before submitting to overlapping markets.';
    }
    return allowed
      ? '⚠️ Nearly identical. Only acceptable as intentional translation/adaptation.'
      : '🚫 Too similar. This would violate editorial standards and exclusivity requirements.';
  }

  /**
   * Check for submission conflicts
   * @param {string} contentId - BlogPost ID
   * @param {string} targetPublicationId - Target publication
   * @param {Object} SubmissionTracking - Submission model (its `find` is awaited)
   * @returns {Promise<Object>} Conflict analysis
   * @throws {Error} If the target publication is not in the publication config
   */
  async checkSubmissionConflict(contentId, targetPublicationId, SubmissionTracking) {
    const targetPub = publicationConfig.getPublicationById(targetPublicationId);
    if (!targetPub) {
      throw new Error(`Publication not found: ${targetPublicationId}`);
    }

    // Find active submissions for this content
    const activeSubmissions = await SubmissionTracking.find({
      blogPostId: contentId,
      status: { $in: ['submitted', 'under_review', 'revision_requested', 'revised'] }
    });

    const conflicts = [];
    for (const submission of activeSubmissions) {
      const submittedPub = publicationConfig.getPublicationById(submission.publicationId);
      // Submissions to publications no longer in the config are ignored.
      if (!submittedPub) continue;

      // Check for market overlap
      if (this._checkMarketConflict(targetPub, submittedPub)) {
        conflicts.push({
          publication: submittedPub.name,
          status: submission.status,
          submittedAt: submission.submittedAt,
          reason: 'Overlapping market/readership',
          severity: 'high'
        });
      }
    }

    const hasConflict = conflicts.length > 0;
    return {
      hasConflict,
      conflicts,
      allowed: !hasConflict,
      // Only the first conflict is named in the summary message.
      message: hasConflict
        ? `Cannot submit: Active submission to ${conflicts[0].publication} (${conflicts[0].status})`
        : 'No conflicts - safe to submit'
    };
  }

  /**
   * Check if two publications have conflicting markets
   * @param {Object} pub1 - First publication config entry
   * @param {Object} pub2 - Second publication config entry
   * @returns {boolean} true when the two markets are considered to conflict
   * @private
   */
  _checkMarketConflict(pub1, pub2) {
    // Same publication = definite conflict
    if (pub1.id === pub2.id) return true;

    // Check readership overlap list
    if (pub1.market?.readershipOverlap?.includes(pub2.id)) return true;
    if (pub2.market?.readershipOverlap?.includes(pub1.id)) return true;

    // Global publications conflict with everything
    if (pub1.market?.reach === 'global' || pub2.market?.reach === 'global') {
      return true;
    }

    // Check regional overlap
    if (pub1.market?.regions && pub2.market?.regions) {
      const overlap = this._calculateRegionOverlap(pub1.market.regions, pub2.market.regions);
      return overlap > 0.5; // More than 50% overlap = conflict
    }

    // Unknown - assume conflict for safety
    return true;
  }

  /**
   * Check title similarity between two articles
   *
   * Titles are compared case-insensitively on letters and digits of any
   * script (punctuation is ignored). Similarity is the Jaccard index of the
   * non-stop-word tokens, with a floor of 0.3 when one non-empty title is
   * contained in the other.
   *
   * @param {string} title1 - First title
   * @param {string} title2 - Second title
   * @returns {Object} Title similarity result:
   *   `{ similarity, pass, threshold, sharedWords, message }`
   * @throws {TypeError} If either title is not a string
   */
  analyzeTitleSimilarity(title1, title2) {
    if (typeof title1 !== 'string' || typeof title2 !== 'string') {
      throw new TypeError('analyzeTitleSimilarity expects both titles to be strings');
    }

    // Threshold: 60% similarity is too high for high-profile publications
    const threshold = 0.6;
    // Remove common stop words
    const stopWords = new Set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']);

    // Unicode-aware: keep letters, combining marks and digits of every script,
    // so non-Latin titles are not all reduced to the empty string.
    const normalize = str => str
      .normalize('NFKC')
      .toLowerCase()
      .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '')
      .trim();
    // Splitting '' yields [''], so empty tokens are dropped explicitly.
    const meaningfulWords = text => new Set(
      text.split(/\s+/).filter(word => word !== '' && !stopWords.has(word))
    );

    const t1 = normalize(title1);
    const t2 = normalize(title2);
    const meaningfulWords1 = meaningfulWords(t1);
    const meaningfulWords2 = meaningfulWords(t2);

    // Exact match
    if (t1 === t2) {
      return {
        similarity: 1.0,
        pass: false,
        threshold,
        sharedWords: [...meaningfulWords1],
        message: '🚫 Identical titles - one must be changed'
      };
    }

    // Calculate word overlap
    const intersection = new Set([...meaningfulWords1].filter(x => meaningfulWords2.has(x)));
    const union = new Set([...meaningfulWords1, ...meaningfulWords2]);
    const jaccardSimilarity = union.size > 0 ? intersection.size / union.size : 0;

    // Check for substring containment. An empty title is "contained" in every
    // string, so it must not count.
    const [shorter, longer] = t1.length <= t2.length ? [t1, t2] : [t2, t1];
    const containment = shorter.length > 0 && longer.includes(shorter) ? 0.3 : 0;

    const finalSimilarity = Math.max(jaccardSimilarity, containment);
    const pass = finalSimilarity < threshold;

    let message;
    if (finalSimilarity >= 0.9) {
      message = '🚫 Nearly identical titles - change required';
    } else if (finalSimilarity >= threshold) {
      message = '⚠️ Titles too similar for same-market publications - change one';
    } else if (finalSimilarity >= 0.4) {
      message = '⚠️ Some title overlap - acceptable but consider variation';
    } else {
      message = '✅ Titles sufficiently distinct';
    }

    return {
      similarity: Math.round(finalSimilarity * 100) / 100,
      pass,
      threshold,
      sharedWords: [...intersection],
      message
    };
  }
}
// Shared service instance, created on the first getInstance() call
let sharedInstance = null;

/**
 * Return the module-wide ContentSimilarityService, constructing it on first use.
 * @returns {ContentSimilarityService} The shared instance
 */
const getInstance = () => {
  if (sharedInstance === null) {
    sharedInstance = new ContentSimilarityService();
  }
  return sharedInstance;
};

module.exports = { getInstance, ContentSimilarityService };