#!/usr/bin/env node

/**
 * Hasher Duplicate Remover Script
 *
 * This script finds and removes duplicate entries from Redis.
 * It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
 *
 * Usage:
 *   npx tsx scripts/remove-duplicates.ts [options]
 *   npm run remove-duplicates [-- options]
 *
 * Options:
 *   --dry-run        Show duplicates without removing them (default)
 *   --execute        Actually remove the duplicates
 *   --field=<name>   Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
 *   --help, -h       Show this help message
 */

import Redis from 'ioredis';

const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
const INDEX_NAME = 'hasher';

/** Hash fields that have a `hash:index:<field>:<value>` key; checked by default. */
const HASH_FIELDS = ['md5', 'sha1', 'sha256', 'sha512'] as const;

/** Every value accepted by `--field`. */
const VALID_FIELDS: readonly string[] = ['plaintext', ...HASH_FIELDS];

/** COUNT hint passed to each SCAN call. */
const SCAN_BATCH_SIZE = 1000;

/** Result of parsing the command line. */
interface ParsedArgs {
  /** True unless `--execute` was given. */
  dryRun: boolean;
  /** Field named by `--field`, or null when the option was not given. */
  field: string | null;
  showHelp: boolean;
}

/** A set of documents that share the same value for one field. */
interface DuplicateGroup {
  value: string;
  field: string;
  plaintexts: string[];
  keepPlaintext: string;
  deletePlaintexts: string[];
}

/** JSON document stored under `hash:plaintext:<plaintext>`. */
interface HashDocument {
  plaintext: string;
  md5: string;
  sha1: string;
  sha256: string;
  sha512: string;
  created_at: string;
}

/**
 * Parses command-line arguments.
 *
 * `--field` may be written as `--field=<name>` or `--field <name>`. When the
 * value is missing, `field` is set to the empty string (not null) so the
 * caller's validation rejects it instead of silently checking every field.
 * Unrecognised arguments are ignored.
 *
 * @param args Arguments after the script name.
 * @returns The parsed options; dry-run is the default mode.
 */
function parseArgs(args: string[]): ParsedArgs {
  const result: ParsedArgs = { dryRun: true, field: null, showHelp: false };
  const fieldPrefix = '--field=';

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '--help' || arg === '-h') {
      result.showHelp = true;
    } else if (arg === '--dry-run') {
      result.dryRun = true;
    } else if (arg === '--execute') {
      result.dryRun = false;
    } else if (arg.startsWith(fieldPrefix)) {
      // Take everything after the prefix so a value containing '=' is not truncated.
      result.field = arg.slice(fieldPrefix.length);
    } else if (arg === '--field') {
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        result.field = nextArg;
        i++;
      } else {
        // Missing value: mark as given-but-empty so validation fails loudly.
        result.field = '';
      }
    }
  }

  return result;
}

/** Prints usage information and exits the process with status 0. */
function showHelp(): never {
  console.log(`
Hasher Duplicate Remover Script

Usage:
  npx tsx scripts/remove-duplicates.ts [options]
  npm run remove-duplicates [-- options]

Options:
  --dry-run        Show duplicates without removing them (default)
  --execute        Actually remove the duplicates
  --field=<name>   Check duplicates only on this field
                   Valid fields: plaintext, md5, sha1, sha256, sha512
  --help, -h       Show this help message

Environment Variables:
  REDIS_HOST       Redis host (default: localhost)
  REDIS_PORT       Redis port (default: 6379)
  REDIS_PASSWORD   Redis password (optional)
  REDIS_DB         Redis database number (default: 0)

Examples:
  npx tsx scripts/remove-duplicates.ts                  # Dry run, show all duplicates
  npx tsx scripts/remove-duplicates.ts --execute        # Remove all duplicates
  npx tsx scripts/remove-duplicates.ts --field=md5      # Check only md5 duplicates
  npx tsx scripts/remove-duplicates.ts --execute --field=plaintext

Notes:
  - The script keeps the OLDEST document (by created_at) and removes newer duplicates
  - Always run with --dry-run first to review what will be deleted
  - Duplicates are checked across all hash fields by default
`);
  process.exit(0);
}

/**
 * Collects every key matching `pattern` using incremental SCAN calls instead
 * of a single KEYS call, so the server is not blocked on a large keyspace.
 * SCAN may return the same key more than once, hence the Set.
 */
async function scanKeys(client: Redis, pattern: string): Promise<string[]> {
  const found = new Set<string>();
  let cursor = '0';

  do {
    const [nextCursor, batch] = await client.scan(
      cursor,
      'MATCH',
      pattern,
      'COUNT',
      SCAN_BATCH_SIZE
    );
    for (const key of batch) {
      found.add(key);
    }
    cursor = nextCursor;
  } while (cursor !== '0');

  return [...found];
}

/** Appends `item` to the list stored under `key`, creating the list if needed. */
function appendToGroup(groups: Map<string, string[]>, key: string, item: string): void {
  const existing = groups.get(key);
  if (existing) {
    existing.push(item);
  } else {
    groups.set(key, [item]);
  }
}

/**
 * Converts a document's `created_at` to epoch milliseconds. Unparseable
 * timestamps sort last so such a document is never preferred as the one to keep.
 */
function createdAtMillis(doc: HashDocument): number {
  const millis = new Date(doc.created_at).getTime();
  return Number.isNaN(millis) ? Number.POSITIVE_INFINITY : millis;
}

/**
 * Finds groups of documents that share the same value for `field`.
 *
 * For `plaintext` the keys `hash:plaintext:*` are scanned; for a hash field the
 * keys `hash:index:<field>:*` are scanned and each key's string value is taken
 * as the plaintext it refers to. Within each group the documents are ordered by
 * `created_at`; the oldest is kept and the rest are marked for deletion.
 *
 * NOTE(review): as visible here, every group is built from distinct Redis keys
 * that each hold a single plaintext, so a group with more than one plaintext
 * may never occur with this key layout — confirm against the writer's data model.
 *
 * @param client Connected Redis client.
 * @param field  One of plaintext, md5, sha1, sha256, sha512.
 * @returns Groups containing at least two existing documents.
 */
async function findDuplicatesForField(
  client: Redis,
  field: string
): Promise<DuplicateGroup[]> {
  const duplicates: DuplicateGroup[] = [];

  console.log(` Scanning for ${field} duplicates...`);

  // Get all keys for this field type
  const plaintextPrefix = 'hash:plaintext:';
  const indexPrefix = `hash:index:${field}:`;
  const pattern = field === 'plaintext' ? 'hash:plaintext:*' : `hash:index:${field}:*`;
  const keys = await scanKeys(client, pattern);

  // Group plaintexts by the value they share (the plaintext itself, or the hash value)
  const valueMap = new Map<string, string[]>();

  if (field === 'plaintext') {
    // Each key is already unique for plaintext
    for (const key of keys) {
      const plaintext = key.slice(plaintextPrefix.length);
      appendToGroup(valueMap, plaintext, plaintext);
    }
  } else {
    // For hash fields, read the plaintext each index key points to
    for (const key of keys) {
      const hashValue = key.slice(indexPrefix.length);
      const plaintext = await client.get(key);
      if (plaintext) {
        appendToGroup(valueMap, hashValue, plaintext);
      }
    }
  }

  // Find groups with duplicates
  for (const [value, plaintexts] of valueMap) {
    const uniquePlaintexts = Array.from(new Set(plaintexts));
    if (uniquePlaintexts.length <= 1) {
      continue;
    }

    // Load the documents so their timestamps can be compared
    const docs: { plaintext: string; doc: HashDocument }[] = [];
    for (const plaintext of uniquePlaintexts) {
      const docData = await client.get(`hash:plaintext:${plaintext}`);
      if (docData) {
        const doc: HashDocument = JSON.parse(docData);
        docs.push({ plaintext, doc });
      }
    }

    // Sort by created_at (oldest first)
    docs.sort((a, b) => {
      const left = createdAtMillis(a.doc);
      const right = createdAtMillis(b.doc);
      if (left < right) return -1;
      if (left > right) return 1;
      return 0;
    });

    if (docs.length > 1) {
      duplicates.push({
        value,
        field,
        plaintexts: docs.map(d => d.plaintext),
        keepPlaintext: docs[0].plaintext,
        deletePlaintexts: docs.slice(1).map(d => d.plaintext),
      });
    }
  }

  return duplicates;
}

/**
 * Connects to Redis, reports duplicate groups and, unless in dry-run mode,
 * deletes the newer documents of each group together with the index keys that
 * point at them, decrementing the `hash:stats` counters accordingly.
 *
 * The connection is always closed before returning. On failure the error is
 * logged and the process exit code is set to 1.
 *
 * @param parsedArgs Parsed command-line options.
 */
async function removeDuplicates(parsedArgs: ParsedArgs): Promise<void> {
  const client = new Redis({
    host: REDIS_HOST,
    port: REDIS_PORT,
    password: REDIS_PASSWORD,
    db: REDIS_DB,
  });

  // Only a missing option (null) selects the default field list.
  const fields: readonly string[] =
    parsedArgs.field !== null ? [parsedArgs.field] : HASH_FIELDS;

  console.log(`🔍 Hasher Duplicate Remover`);
  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
  console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT} (DB ${REDIS_DB})`);
  console.log(`Index: ${INDEX_NAME}`);
  console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
  console.log(`Fields to check: ${fields.join(', ')}`);
  console.log('');

  try {
    // Test connection
    console.log('🔗 Connecting to Redis...');
    await client.ping();
    console.log('✅ Connected successfully\n');

    // Get index stats
    const stats = await client.hgetall('hash:stats');
    const totalCount = parseInt(stats.count || '0', 10);
    console.log(`📊 Total documents in index: ${totalCount}\n`);

    const allDuplicates: DuplicateGroup[] = [];
    const seenPlaintexts = new Set<string>();

    // Find duplicates for each field
    for (const field of fields) {
      console.log(`🔍 Checking duplicates for field: ${field}...`);
      const fieldDuplicates = await findDuplicatesForField(client, field);

      // Drop plaintexts already scheduled for deletion by an earlier field
      for (const dup of fieldDuplicates) {
        const newDeletePlaintexts = dup.deletePlaintexts.filter(p => !seenPlaintexts.has(p));
        if (newDeletePlaintexts.length > 0) {
          dup.deletePlaintexts = newDeletePlaintexts;
          newDeletePlaintexts.forEach(p => seenPlaintexts.add(p));
          allDuplicates.push(dup);
        }
      }

      console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
    }

    const totalToDelete = allDuplicates.reduce(
      (sum, dup) => sum + dup.deletePlaintexts.length,
      0
    );

    console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`📋 Summary:`);
    console.log(` Duplicate groups found: ${allDuplicates.length}`);
    console.log(` Documents to delete: ${totalToDelete}`);
    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);

    if (allDuplicates.length === 0) {
      console.log('✨ No duplicates found! Index is clean.\n');
      return;
    }

    // Show sample of duplicates
    console.log(`📝 Sample duplicates (showing first 10):\n`);
    const samplesToShow = allDuplicates.slice(0, 10);
    for (const dup of samplesToShow) {
      const truncatedValue =
        dup.value.length > 50 ? dup.value.substring(0, 50) + '...' : dup.value;
      console.log(` Field: ${dup.field}`);
      console.log(` Value: ${truncatedValue}`);
      console.log(` Keep: ${dup.keepPlaintext}`);
      console.log(` Delete: ${dup.deletePlaintexts.length} document(s)`);
      console.log('');
    }

    if (allDuplicates.length > 10) {
      console.log(` ... and ${allDuplicates.length - 10} more duplicate groups\n`);
    }

    if (parsedArgs.dryRun) {
      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
      console.log(`🔎 DRY RUN - No changes made`);
      console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
      return;
    }

    // Execute deletion
    console.log(`\n🗑️ Removing ${totalToDelete} duplicate documents...\n`);

    let deleted = 0;
    let errors = 0;
    let processed = 0;

    for (const dup of allDuplicates) {
      for (const plaintext of dup.deletePlaintexts) {
        // Count every attempt so the progress figure reaches the total even
        // when a document has already disappeared.
        processed++;
        try {
          const docKey = `hash:plaintext:${plaintext}`;
          const docData = await client.get(docKey);

          if (docData) {
            const doc: HashDocument = JSON.parse(docData);

            // An index key may point at a different document that shares the
            // same hash (for example the one being kept); only remove index
            // keys that currently point at the plaintext being deleted.
            const indexKeys = HASH_FIELDS.map(f => `hash:index:${f}:${doc[f]}`);
            const indexTargets = await client.mget(indexKeys);
            const ownedIndexKeys = indexKeys.filter((_, i) => indexTargets[i] === plaintext);

            const pipeline = client.pipeline();

            // Delete main document
            pipeline.del(docKey);

            // Delete the indexes owned by this document
            for (const indexKey of ownedIndexKeys) {
              pipeline.del(indexKey);
            }

            // Update statistics
            pipeline.hincrby('hash:stats', 'count', -1);
            pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);

            const results = await pipeline.exec();

            // A null result means the pipeline produced no replies; treat it as a failure.
            if (results === null || results.some(([err]) => err !== null)) {
              errors++;
            } else {
              deleted++;
            }
          }

          process.stdout.write(
            `\r⏳ Progress: ${processed}/${totalToDelete} - Deleted: ${deleted}, Errors: ${errors}`
          );
        } catch (error) {
          console.error(`\n❌ Error deleting ${plaintext}:`, error);
          errors++;
        }
      }
    }

    // Get new count
    const newStats = await client.hgetall('hash:stats');
    const newCount = parseInt(newStats.count || '0', 10);

    console.log('\n\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log('✅ Duplicate removal complete!');
    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`Documents deleted: ${deleted}`);
    console.log(`Errors: ${errors}`);
    console.log(`Previous document count: ${totalCount}`);
    console.log(`New document count: ${newCount}`);
    console.log('');
  } catch (error) {
    console.error('\n❌ Error:', error instanceof Error ? error.message : error);
    // Signal failure without skipping the cleanup in `finally`.
    process.exitCode = 1;
  } finally {
    // Close the connection on every path so the process can exit.
    client.disconnect();
  }
}

// Parse command line arguments
const args = process.argv.slice(2);
const parsedArgs = parseArgs(args);

if (parsedArgs.showHelp) {
  showHelp();
}

// Validate field if provided. The check is against null rather than truthiness
// so that an empty `--field=` is rejected instead of selecting every field.
if (parsedArgs.field !== null && !VALID_FIELDS.includes(parsedArgs.field)) {
  console.error(`❌ Invalid field: ${parsedArgs.field}`);
  console.error(` Valid fields: ${VALID_FIELDS.join(', ')}`);
  process.exit(1);
}

console.log(`\n🔧 Configuration:`);
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
if (parsedArgs.field !== null) {
  console.log(` Field: ${parsedArgs.field}`);
} else {
  console.log(` Fields: all (md5, sha1, sha256, sha512)`);
}
console.log('');

removeDuplicates(parsedArgs).catch(console.error);