- Fixed unused function parameters by prefixing with underscore
- Removed unused imports and variables
- Applied eslint --fix for automatic style fixes:
  - Property shorthand
  - String template literals
  - Prefer const over let where appropriate
  - Spacing and formatting

Reduces lint errors from 108+ to 78 (61 unused vars, 17 other issues).

Related to CI lint failures in previous commit.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
410 lines
14 KiB
JavaScript
410 lines
14 KiB
JavaScript
/**
 * Content Similarity Service
 * Analyzes semantic similarity between articles with market-aware adjustments
 */
|
|
|
|
const ClaudeAPIService = require('./ClaudeAPI.service');
|
|
const publicationConfig = require('../config/publication-targets.config');
|
|
const logger = require('../utils/logger.util');
|
|
|
|
class ContentSimilarityService {
  /**
   * Analyze semantic similarity between two articles.
   *
   * Sends excerpts of both articles to the Claude API, parses the JSON
   * assessment it returns, and then applies market-aware adjustments.
   *
   * @param {Object} article1 - First article (reads `title`, `content`, `wordCount`)
   * @param {Object} article2 - Second article (reads `title`, `content`, `wordCount`)
   * @param {Object} context - Optional context (target publications, threshold, adaptation flag)
   * @returns {Promise<Object>} Similarity analysis with market adjustments
   * @throws {Error} "Failed to analyze similarity: ..." wrapping any failure
   *   (the original error is attached as `cause`)
   */
  async analyzeSimilarity(article1, article2, context = {}) {
    logger.info(`Analyzing similarity: "${article1.title}" vs "${article2.title}"`);

    try {
      const claudeAPI = ClaudeAPIService;

      const systemPrompt = this._buildSystemPrompt();
      const userPrompt = this._buildUserPrompt(article1, article2);
      const messages = [{ role: 'user', content: userPrompt }];

      const response = await claudeAPI.sendMessage(messages, {
        system: systemPrompt,
        max_tokens: 1500,
        temperature: 0.2 // Low temperature for consistent analysis
      });

      const baseSimilarity = claudeAPI.extractJSON(response);

      // Apply market-aware adjustments
      const adjusted = this._applyMarketAdjustments(
        baseSimilarity,
        article1,
        article2,
        context
      );

      logger.info(`Similarity analysis complete: ${adjusted.finalSimilarity.toFixed(2)}`);

      return adjusted;
    } catch (error) {
      logger.error('Similarity analysis error:', error);
      // Keep the original error reachable for callers that want the root cause.
      throw new Error(`Failed to analyze similarity: ${error.message}`, { cause: error });
    }
  }

  /**
   * Build the system prompt describing the analysis task and the JSON
   * response format expected from the model.
   * @private
   * @returns {string} System prompt text
   */
  _buildSystemPrompt() {
    return [
      'You are an expert content analyst evaluating semantic similarity between two articles.',
      '',
      'Your task is to analyze:',
      '1. **Core Argument Similarity**: Do they make the same fundamental argument?',
      '2. **Evidence/Examples**: Do they use the same examples or evidence?',
      '3. **Narrative/Framing**: Do they frame the issue the same way?',
      '4. **Conclusions**: Do they reach the same conclusions?',
      '5. **Unique Elements**: What makes each article distinct?',
      '',
      'Return JSON:',
      '{',
      '  "semanticSimilarity": 0.0-1.0,',
      '  "argumentSimilarity": 0.0-1.0,',
      '  "evidenceSimilarity": 0.0-1.0,',
      '  "framingSimilarity": 0.0-1.0,',
      '  "conclusionSimilarity": 0.0-1.0,',
      '  "uniqueElements": {',
      '    "article1": ["Unique aspect 1", "Unique aspect 2"],',
      '    "article2": ["Unique aspect 1", "Unique aspect 2"]',
      '  },',
      '  "reasoning": "Detailed explanation of similarity assessment",',
      '  "verdict": "distinct|similar|very-similar|duplicate"',
      '}',
      '',
      'Similarity scale:',
      '- 0.0-0.3: Distinct (different arguments)',
      '- 0.3-0.5: Some overlap (related topics, different angles)',
      '- 0.5-0.7: Similar (same argument, different execution)',
      '- 0.7-0.9: Very similar (same argument and approach)',
      '- 0.9-1.0: Duplicate (essentially the same article)'
    ].join('\n');
  }

  /**
   * Build the user prompt containing an excerpt of each article.
   * @private
   * @param {Object} article1 - First article
   * @param {Object} article2 - Second article
   * @returns {string} User prompt text
   * @throws {TypeError} If either article has no string `content`
   */
  _buildUserPrompt(article1, article2) {
    return [
      'Compare these two articles:',
      '',
      this._formatArticleForPrompt('Article 1', article1),
      '',
      '---',
      '',
      this._formatArticleForPrompt('Article 2', article2),
      '',
      '---',
      '',
      'Analyze semantic similarity and provide detailed JSON response.'
    ].join('\n');
  }

  /**
   * Render one article (title, word count, truncated content) for the prompt.
   * @private
   * @param {string} label - Heading label, e.g. "Article 1"
   * @param {Object} article - Article to render
   * @param {number} [maxLength=3000] - Maximum number of content characters to include
   * @returns {string} Prompt section for the article
   * @throws {TypeError} If `article.content` is not a string
   */
  _formatArticleForPrompt(label, article, maxLength = 3000) {
    if (typeof article?.content !== 'string') {
      // Fail with a clear message rather than an opaque "reading 'substring'" error.
      throw new TypeError(`${label} has no text content to compare`);
    }

    const { title, wordCount, content } = article;
    // An ellipsis signals to the model that the excerpt was cut short.
    const excerpt = content.length > maxLength
      ? `${content.substring(0, maxLength)}...`
      : content;

    return [
      `**${label}: "${title}"**`,
      `Word Count: ${wordCount || 'unknown'}`,
      '',
      excerpt
    ].join('\n');
  }

  /**
   * Apply market-aware adjustments to similarity score.
   *
   * Each applicable adjustment is a negative offset added to the model's
   * `semanticSimilarity`; the result is clamped to [0, 1] and compared with
   * the threshold (`context.similarityThreshold`, default 0.70).
   *
   * @private
   * @param {Object} baseSimilarity - Parsed model response; must carry a numeric
   *   (or numeric-string) `semanticSimilarity`
   * @param {Object} article1 - First article (currently unused)
   * @param {Object} article2 - Second article (currently unused)
   * @param {Object} context - Reads `targetPublication1`, `targetPublication2`,
   *   `isAdaptation` and `similarityThreshold`
   * @returns {Object} Adjusted similarity result
   * @throws {TypeError} If `semanticSimilarity` is missing or not a finite number
   */
  _applyMarketAdjustments(baseSimilarity, article1, article2, context) {
    // The score comes from model-generated JSON, so validate it instead of
    // letting NaN (or string concatenation) leak into the verdict.
    const rawScore = baseSimilarity?.semanticSimilarity;
    const baseScore = typeof rawScore === 'string' && rawScore.trim() !== ''
      ? Number(rawScore)
      : rawScore;
    if (typeof baseScore !== 'number' || !Number.isFinite(baseScore)) {
      throw new TypeError('Similarity response is missing a numeric semanticSimilarity');
    }

    const adjustments = {
      differentLanguage: 0,
      differentRegion: 0,
      differentCultural: 0,
      differentAudience: 0,
      isAdaptation: 0
    };

    const reasoning = [];

    // Get publication market data
    const pub1 = context.targetPublication1
      ? publicationConfig.getPublicationById(context.targetPublication1)
      : null;
    const pub2 = context.targetPublication2
      ? publicationConfig.getPublicationById(context.targetPublication2)
      : null;

    // Different language modifier
    if (pub1?.market?.language && pub2?.market?.language) {
      if (pub1.market.language !== pub2.market.language) {
        adjustments.differentLanguage = -0.3;
        reasoning.push(`Different languages (${pub1.market.language} vs ${pub2.market.language}) - major adjustment`);
      }
    }

    // Different region modifier
    if (pub1?.market && pub2?.market) {
      const overlap = this._calculateRegionOverlap(pub1.market.regions, pub2.market.regions);

      if (overlap === 0) {
        adjustments.differentRegion = -0.2;
        reasoning.push(`No regional overlap - content can be reused across markets`);
      } else if (overlap < 0.3) {
        adjustments.differentRegion = -0.1;
        reasoning.push(`Minimal regional overlap (${(overlap * 100).toFixed(0)}%) - slight adjustment`);
      } else if (overlap > 0.7) {
        // Informational only: high overlap applies no adjustment.
        reasoning.push(`High regional overlap (${(overlap * 100).toFixed(0)}%) - similarity matters`);
      }
    }

    // Different cultural context ('universal' is compatible with everything)
    if (pub1?.market?.culturalContext && pub2?.market?.culturalContext) {
      if (pub1.market.culturalContext !== pub2.market.culturalContext &&
          pub1.market.culturalContext !== 'universal' &&
          pub2.market.culturalContext !== 'universal') {
        adjustments.differentCultural = -0.15;
        reasoning.push(`Different cultural contexts (${pub1.market.culturalContext} vs ${pub2.market.culturalContext})`);
      }
    }

    // Different audience sophistication
    // NOTE(review): `audience` is passed to `new Set(...)`, so it is assumed to be
    // an array of audience tags - confirm against publication-targets.config.
    if (pub1?.audience && pub2?.audience) {
      const audienceOverlap = this._calculateAudienceOverlap(pub1.audience, pub2.audience);
      if (audienceOverlap < 0.3) {
        adjustments.differentAudience = -0.1;
        reasoning.push(`Different target audiences - allows similar messaging with different framing`);
      }
    }

    // Check if intentional adaptation
    if (context.isAdaptation) {
      adjustments.isAdaptation = -0.4;
      reasoning.push(`Marked as intentional adaptation - high similarity expected and allowed`);
    }

    // Calculate final similarity, clamped to the [0, 1] range
    const totalAdjustment = Object.values(adjustments).reduce((sum, val) => sum + val, 0);
    const finalSimilarity = Math.max(0, Math.min(1, baseScore + totalAdjustment));

    // Determine if allowed. `??` keeps an explicit threshold of 0 instead of
    // silently replacing it with the default.
    const threshold = context.similarityThreshold ?? 0.70;
    const allowed = finalSimilarity < threshold;

    return {
      baseSimilarity: baseScore,
      adjustments,
      totalAdjustment,
      finalSimilarity,
      threshold,
      allowed,
      verdict: baseSimilarity.verdict,
      reasoning: reasoning.join('; '),
      detailedAnalysis: baseSimilarity,
      recommendation: this._generateRecommendation(finalSimilarity, allowed, context)
    };
  }

  /**
   * Fraction of distinct items shared by two lists, relative to the larger list.
   * @private
   * @param {Iterable} items1 - First list
   * @param {Iterable} items2 - Second list
   * @returns {number} Value in [0, 1]; 0 when either list is empty
   */
  _overlapRatio(items1, items2) {
    const set1 = new Set(items1);
    const set2 = new Set(items2);

    if (set1.size === 0 || set2.size === 0) return 0;

    let shared = 0;
    for (const item of set1) {
      if (set2.has(item)) shared += 1;
    }

    return shared / Math.max(set1.size, set2.size);
  }

  /**
   * Calculate regional overlap between two publication markets
   * @private
   * @param {string[]} regions1 - Regions of the first market
   * @param {string[]} regions2 - Regions of the second market
   * @returns {number} Overlap in [0, 1]; 0.5 when either list is unknown
   */
  _calculateRegionOverlap(regions1, regions2) {
    if (!regions1 || !regions2) return 0.5; // Unknown, assume moderate overlap

    // Handle global reach specially
    if (regions1.includes('global') || regions2.includes('global')) {
      return 1.0; // Global publications overlap with everything
    }

    return this._overlapRatio(regions1, regions2);
  }

  /**
   * Calculate audience overlap
   * @private
   * @param {Iterable} audience1 - Audience of the first publication
   * @param {Iterable} audience2 - Audience of the second publication
   * @returns {number} Overlap in [0, 1]; 0.5 when either audience is unknown
   */
  _calculateAudienceOverlap(audience1, audience2) {
    if (!audience1 || !audience2) return 0.5;

    return this._overlapRatio(audience1, audience2);
  }

  /**
   * Generate human-readable recommendation
   * @private
   * @param {number} similarity - Final (adjusted) similarity score
   * @param {boolean} allowed - Whether the score is below the threshold
   * @param {Object} context - Analysis context (currently unused)
   * @returns {string} Recommendation message
   */
  _generateRecommendation(similarity, allowed, context) {
    if (similarity < 0.3) {
      return '✅ These articles are distinct. Safe to publish both.';
    }
    if (similarity < 0.5) {
      return '✅ Some thematic overlap, but different enough for both markets.';
    }
    if (similarity < 0.7) {
      return allowed
        ? '⚠️ Moderately similar. Acceptable for different markets, but consider revisions for same market.'
        : '⚠️ Moderately similar. Consider significant revisions to differentiate.';
    }
    if (similarity < 0.85) {
      return allowed
        ? '⚠️ Very similar. Only acceptable due to different markets/languages. Ensure cultural adaptation.'
        : '🚫 Very similar. Revisions required before submitting to overlapping markets.';
    }
    return allowed
      ? '⚠️ Nearly identical. Only acceptable as intentional translation/adaptation.'
      : '🚫 Too similar. This would violate editorial standards and exclusivity requirements.';
  }

  /**
   * Check for submission conflicts
   *
   * Looks up active submissions of the same content and reports those whose
   * publication conflicts with the target publication's market.
   *
   * @param {string} contentId - BlogPost ID
   * @param {string} targetPublicationId - Target publication
   * @param {Object} SubmissionTracking - Submission model (must expose `find(query)`)
   * @returns {Promise<Object>} Conflict analysis
   * @throws {Error} If the target publication is not found
   */
  async checkSubmissionConflict(contentId, targetPublicationId, SubmissionTracking) {
    const targetPub = publicationConfig.getPublicationById(targetPublicationId);

    if (!targetPub) {
      throw new Error(`Publication not found: ${targetPublicationId}`);
    }

    // Find active submissions for this content
    const activeSubmissions = await SubmissionTracking.find({
      blogPostId: contentId,
      status: { $in: ['submitted', 'under_review', 'revision_requested', 'revised'] }
    });

    const conflicts = [];

    for (const submission of activeSubmissions) {
      const submittedPub = publicationConfig.getPublicationById(submission.publicationId);

      // Submissions to publications no longer in the config are skipped.
      if (!submittedPub) continue;

      // Check for market overlap
      const hasOverlap = this._checkMarketConflict(targetPub, submittedPub);

      if (hasOverlap) {
        conflicts.push({
          publication: submittedPub.name,
          status: submission.status,
          submittedAt: submission.submittedAt,
          reason: 'Overlapping market/readership',
          severity: 'high'
        });
      }
    }

    return {
      hasConflict: conflicts.length > 0,
      conflicts,
      allowed: conflicts.length === 0,
      message: conflicts.length > 0
        ? `Cannot submit: Active submission to ${conflicts[0].publication} (${conflicts[0].status})`
        : 'No conflicts - safe to submit'
    };
  }

  /**
   * Check if two publications have conflicting markets
   * @private
   * @param {Object} pub1 - First publication (reads `id` and `market`)
   * @param {Object} pub2 - Second publication (reads `id` and `market`)
   * @returns {boolean} True when submitting to both would conflict
   */
  _checkMarketConflict(pub1, pub2) {
    // Same publication = definite conflict
    if (pub1.id === pub2.id) return true;

    // Check readership overlap list
    if (pub1.market?.readershipOverlap?.includes(pub2.id)) return true;
    if (pub2.market?.readershipOverlap?.includes(pub1.id)) return true;

    // Global publications conflict with everything
    if (pub1.market?.reach === 'global' || pub2.market?.reach === 'global') {
      return true;
    }

    // Check regional overlap
    if (pub1.market?.regions && pub2.market?.regions) {
      const overlap = this._calculateRegionOverlap(pub1.market.regions, pub2.market.regions);
      return overlap > 0.5; // More than 50% overlap = conflict
    }

    // Unknown - assume conflict for safety
    return true;
  }

  /**
   * Check title similarity between two articles
   *
   * Titles are lower-cased and stripped of punctuation, then compared by
   * Jaccard similarity over their non-stop-word tokens. A title wholly
   * contained in the other scores at least 0.3.
   *
   * @param {string} title1 - First title
   * @param {string} title2 - Second title
   * @returns {Object} Title similarity result: `similarity`, `pass`,
   *   `threshold`, `sharedWords`, `message`
   */
  analyzeTitleSimilarity(title1, title2) {
    const raw1 = String(title1 ?? '').trim();
    const raw2 = String(title2 ?? '').trim();

    // Keep letters, combining marks and digits of every script. An ASCII-only
    // filter would reduce all non-Latin titles to '' and report them as identical.
    const normalize = str => str.toLowerCase().replace(/[^\p{L}\p{M}\p{N}\s]/gu, '').trim();

    const t1 = normalize(raw1);
    const t2 = normalize(raw2);

    // Remove common stop words
    const stopWords = new Set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']);
    // filter(Boolean) drops the '' token that split() yields for an empty title.
    const meaningfulWords = text => new Set(
      text.split(/\s+/).filter(Boolean).filter(word => !stopWords.has(word))
    );

    const meaningfulWords1 = meaningfulWords(t1);
    const meaningfulWords2 = meaningfulWords(t2);

    // Threshold: 60% similarity is too high for high-profile publications
    const threshold = 0.6;

    // Exact match. Titles that normalise to nothing (e.g. punctuation only)
    // count as identical only when the raw text is also identical.
    if (t1 === t2 && (t1 !== '' || raw1 === raw2)) {
      return {
        similarity: 1.0,
        pass: false,
        threshold,
        sharedWords: [...meaningfulWords1],
        message: '🚫 Identical titles - one must be changed'
      };
    }

    // Calculate word overlap
    const intersection = new Set([...meaningfulWords1].filter(x => meaningfulWords2.has(x)));
    const union = new Set([...meaningfulWords1, ...meaningfulWords2]);

    const jaccardSimilarity = union.size > 0 ? intersection.size / union.size : 0;

    // Check for substring containment. Every string "contains" '', so an empty
    // title must not count as contained in the other.
    const oneContainsOther = t1 !== '' && t2 !== '' && (t1.includes(t2) || t2.includes(t1));
    const containment = oneContainsOther ? 0.3 : 0;

    const finalSimilarity = Math.max(jaccardSimilarity, containment);
    const pass = finalSimilarity < threshold;

    let message;
    if (finalSimilarity >= 0.9) {
      message = '🚫 Nearly identical titles - change required';
    } else if (finalSimilarity >= threshold) {
      message = '⚠️ Titles too similar for same-market publications - change one';
    } else if (finalSimilarity >= 0.4) {
      message = '⚠️ Some title overlap - acceptable but consider variation';
    } else {
      message = '✅ Titles sufficiently distinct';
    }

    return {
      similarity: Math.round(finalSimilarity * 100) / 100,
      pass,
      threshold,
      sharedWords: [...intersection],
      message
    };
  }
}
|
|
|
|
// Singleton instance
|
|
let instance = null;
|
|
|
|
module.exports = {
|
|
getInstance: () => {
|
|
if (!instance) {
|
|
instance = new ContentSimilarityService();
|
|
}
|
|
return instance;
|
|
},
|
|
ContentSimilarityService
|
|
};
|