#!/usr/bin/env node

/**
 * Hasher Duplicate Remover Script
 *
 * This script finds and removes duplicate entries from Redis.
 * It identifies duplicates through the md5, sha1, sha256, and sha512 indexes.
 *
 * Usage:
 *   npx tsx scripts/remove-duplicates.ts [options]
 *   npm run remove-duplicates [-- options]
 *
 * Options:
 *   --dry-run          Show duplicates without removing them (default)
 *   --execute          Actually remove the duplicates
 *   --batch-size=<n>   Number of keys to scan in each batch (default: 1000)
 *   --field=<name>     Check duplicates only on this field (md5, sha1, sha256, sha512)
 *   --help, -h         Show this help message
 */

import Redis from 'ioredis';

// `||` (not `??`) is deliberate: an empty environment variable falls back to the default.
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
const DEFAULT_BATCH_SIZE = 1000;

/** Hash fields that have a `hash:index:<field>:<hash>` lookup key. */
const HASH_FIELDS = ['md5', 'sha1', 'sha256', 'sha512'] as const;
type HashField = (typeof HASH_FIELDS)[number];

/** JSON document stored under `hash:plaintext:<plaintext>`. */
interface HashDocument {
  plaintext: string;
  md5: string;
  sha1: string;
  sha256: string;
  sha512: string;
  created_at: string;
}

/** Result of parsing the command line. */
interface ParsedArgs {
  dryRun: boolean;
  batchSize: number;
  field: string | null;
  showHelp: boolean;
}

/** A set of documents that share the same value for one hash field. */
interface DuplicateGroup {
  value: string;
  field: string;
  plaintexts: string[];
  keepPlaintext: string;
  deletePlaintexts: string[];
}

/** Returns true when `value` names one of the indexed hash fields. */
function isHashField(value: string): value is HashField {
  return (HASH_FIELDS as readonly string[]).includes(value);
}

/** Structural check for a parsed `hash:plaintext:*` payload. */
function isHashDocument(value: unknown): value is HashDocument {
  if (typeof value !== 'object' || value === null) {
    return false;
  }
  const record = value as Record<string, unknown>;
  return (
    typeof record.plaintext === 'string' &&
    typeof record.created_at === 'string' &&
    HASH_FIELDS.every((name) => typeof record[name] === 'string')
  );
}

/**
 * Parses a stored document.
 *
 * @returns the document, or `null` when the payload is missing, is not valid
 *          JSON, or does not have the expected string fields.
 */
function parseHashDocument(raw: string | null | undefined): HashDocument | null {
  if (!raw) {
    return null;
  }
  try {
    const parsed: unknown = JSON.parse(raw);
    return isHashDocument(parsed) ? parsed : null;
  } catch {
    return null;
  }
}

/** Parses a strictly positive integer; returns `null` for anything else. */
function parsePositiveInt(text: string): number | null {
  const parsed = parseInt(text, 10);
  return !isNaN(parsed) && parsed > 0 ? parsed : null;
}

/**
 * Parses command-line arguments (without the `node script` prefix).
 *
 * Both `--opt=value` and `--opt value` forms are accepted for `--batch-size`
 * and `--field`. Unknown arguments and invalid batch sizes are ignored, so the
 * defaults (dry run, batch size 1000, all fields) stay in effect.
 */
function parseArgs(args: string[]): ParsedArgs {
  const result: ParsedArgs = {
    dryRun: true,
    batchSize: DEFAULT_BATCH_SIZE,
    field: null,
    showHelp: false,
  };

  const batchPrefix = '--batch-size=';
  const fieldPrefix = '--field=';

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === undefined) {
      continue;
    }
    const nextArg = args[i + 1];
    // A following token only counts as a value when it is not itself an option.
    const nextIsValue = nextArg !== undefined && nextArg !== '' && !nextArg.startsWith('-');

    if (arg === '--help' || arg === '-h') {
      result.showHelp = true;
    } else if (arg === '--dry-run') {
      result.dryRun = true;
    } else if (arg === '--execute') {
      result.dryRun = false;
    } else if (arg.startsWith(batchPrefix)) {
      const size = parsePositiveInt(arg.slice(batchPrefix.length));
      if (size !== null) {
        result.batchSize = size;
      }
    } else if (arg === '--batch-size') {
      const size = nextIsValue ? parsePositiveInt(nextArg) : null;
      if (size !== null) {
        result.batchSize = size;
        i++; // consume the value
      }
    } else if (arg.startsWith(fieldPrefix)) {
      result.field = arg.slice(fieldPrefix.length);
    } else if (arg === '--field') {
      if (nextIsValue) {
        result.field = nextArg;
        i++; // consume the value
      }
    }
  }

  return result;
}

/** Prints usage information to stdout. */
function showHelp(): void {
  console.log(`
Hasher Duplicate Remover Script

Usage:
  npx tsx scripts/remove-duplicates.ts [options]
  npm run remove-duplicates [-- options]

Options:
  --dry-run          Show duplicates without removing them (default)
  --execute          Actually remove the duplicates
  --batch-size=<n>   Number of keys to scan in each batch (default: 1000)
  --field=<name>     Check duplicates only on this field
                     Valid fields: md5, sha1, sha256, sha512
  --help, -h         Show this help message

Environment Variables:
  REDIS_HOST         Redis host (default: localhost)
  REDIS_PORT         Redis port (default: 6379)
  REDIS_PASSWORD     Redis password (optional)
  REDIS_DB           Redis database number (default: 0)

Examples:
  # Dry run (show duplicates only)
  npm run remove-duplicates

  # Actually remove duplicates
  npm run remove-duplicates -- --execute

  # Check only MD5 duplicates
  npm run remove-duplicates -- --field=md5 --execute

Description:
  This script scans through all hash documents in Redis and identifies
  duplicates based on hash values. When duplicates are found, it keeps
  the oldest entry (by created_at) and marks the rest for deletion.
`);
}

/**
 * Scans the `hash:index:<field>:*` keys and groups the plaintexts they point to
 * by hash value. A hash that maps to more than one distinct plaintext with a
 * readable document becomes a {@link DuplicateGroup}; the document with the
 * smallest `created_at` is kept.
 *
 * @param client    connected Redis client
 * @param field     hash field whose index is scanned
 * @param batchSize COUNT hint passed to SCAN
 * @returns the duplicate groups found for `field`
 */
async function findDuplicatesForField(
  client: Redis,
  field: 'md5' | 'sha1' | 'sha256' | 'sha512',
  batchSize: number
): Promise<DuplicateGroup[]> {
  const prefix = `hash:index:${field}:`;
  const pattern = `${prefix}*`;
  // A Set per hash: SCAN may return the same key more than once, and counting
  // one plaintext twice would create a group that deletes the entry it keeps.
  const hashToPlaintexts = new Map<string, Set<string>>();

  console.log(`🔍 Scanning ${field} indexes...`);

  let cursor = '0';
  let keysScanned = 0;

  do {
    const [nextCursor, keys] = await client.scan(cursor, 'MATCH', pattern, 'COUNT', batchSize);
    cursor = nextCursor;
    keysScanned += keys.length;

    if (keys.length > 0) {
      // One MGET per batch instead of one GET round-trip per key.
      const values = await client.mget(keys);
      for (const [idx, key] of keys.entries()) {
        const plaintext = values[idx];
        if (!plaintext) {
          continue;
        }
        const hash = key.slice(prefix.length);
        let bucket = hashToPlaintexts.get(hash);
        if (bucket === undefined) {
          bucket = new Set<string>();
          hashToPlaintexts.set(hash, bucket);
        }
        bucket.add(plaintext);
      }
    }

    process.stdout.write(`\r   Keys scanned: ${keysScanned}   `);
  } while (cursor !== '0');

  console.log('');

  // NOTE(review): each index key holds a single plaintext and Redis key names
  // are unique, so a hash can only collect several distinct plaintexts here if
  // the index value changes while the scan is running. Confirm whether
  // duplicates were meant to be detected by scanning the hash:plaintext:*
  // documents instead.
  const duplicates: DuplicateGroup[] = [];

  for (const [hash, bucket] of hashToPlaintexts) {
    if (bucket.size < 2) {
      continue;
    }

    // Fetch documents to get created_at timestamps.
    const plaintexts = [...bucket];
    const rawDocs = await client.mget(plaintexts.map((pt) => `hash:plaintext:${pt}`));

    const entries: Array<{ plaintext: string; doc: HashDocument }> = [];
    for (const [idx, plaintext] of plaintexts.entries()) {
      const raw = rawDocs[idx];
      const doc = parseHashDocument(raw);
      if (doc !== null) {
        // Track the key plaintext the document was fetched under, so a later
        // delete targets exactly that key.
        entries.push({ plaintext, doc });
      } else if (raw) {
        console.warn(`   Skipping unreadable document for plaintext: ${plaintext}`);
      }
    }

    if (entries.length < 2) {
      continue;
    }

    // Sort by created_at, keep oldest.
    entries.sort((a, b) => a.doc.created_at.localeCompare(b.doc.created_at));
    const oldest = entries[0];
    if (oldest === undefined) {
      continue;
    }

    duplicates.push({
      value: hash,
      field,
      plaintexts: entries.map((entry) => entry.plaintext),
      keepPlaintext: oldest.plaintext,
      deletePlaintexts: entries.slice(1).map((entry) => entry.plaintext),
    });
  }

  return duplicates;
}

/**
 * Deletes one duplicate document, its index keys, and updates `hash:stats`.
 *
 * Index keys whose hash is shared with the kept document are re-pointed at the
 * kept plaintext instead of being deleted, so the surviving entry stays
 * reachable through every index.
 *
 * @returns `'deleted'` on success, `'missing'` when the document no longer
 *          exists, `'failed'` when it is unreadable or a pipeline command failed
 */
async function deleteDuplicate(
  client: Redis,
  plaintext: string,
  keepPlaintext: string,
  keeper: HashDocument
): Promise<'deleted' | 'missing' | 'failed'> {
  const docKey = `hash:plaintext:${plaintext}`;
  const docData = await client.get(docKey);
  if (!docData) {
    return 'missing';
  }

  const doc = parseHashDocument(docData);
  if (doc === null) {
    console.error(`  Skipping ${plaintext}: stored document is not a valid hash document`);
    return 'failed';
  }

  const pipeline = client.pipeline();

  // Delete the main document.
  pipeline.del(docKey);

  // Delete the indexes owned by this document; keep the shared ones alive.
  for (const name of HASH_FIELDS) {
    const indexKey = `hash:index:${name}:${doc[name]}`;
    if (doc[name] === keeper[name]) {
      // NOTE(review): SET drops any TTL on the index key — confirm index keys
      // are stored without expiry.
      pipeline.set(indexKey, keepPlaintext);
    } else {
      pipeline.del(indexKey);
    }
  }

  // Update statistics.
  pipeline.hincrby('hash:stats', 'count', -1);
  pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);

  const results = await pipeline.exec();
  // A null result means nothing was reported back; treat it as a failure.
  if (results === null || results.some(([err]) => err !== null)) {
    return 'failed';
  }
  return 'deleted';
}

/**
 * Prints every duplicate group and, unless `dryRun` is set, deletes the
 * documents listed in `deletePlaintexts`.
 *
 * The entry to keep is never deleted, and a group is skipped (and counted as
 * errors) when the entry to keep can no longer be loaded.
 *
 * @returns number of documents deleted and number of failed deletions
 */
async function removeDuplicates(
  client: Redis,
  duplicates: DuplicateGroup[],
  dryRun: boolean
): Promise<{ deleted: number; errors: number }> {
  let deleted = 0;
  let errors = 0;

  console.log('');
  console.log(dryRun ? '🔍 DRY RUN - Would delete:' : '🗑️ Deleting duplicates...');
  console.log('');

  for (const dup of duplicates) {
    // Guard against a group that lists the keeper (or one entry twice) for deletion.
    const targets = [...new Set(dup.deletePlaintexts)].filter((pt) => pt !== dup.keepPlaintext);

    console.log(`Duplicate ${dup.field}: ${dup.value}`);
    console.log(`  Keep:   ${dup.keepPlaintext} (oldest)`);
    console.log(`  Delete: ${targets.join(', ')}`);

    if (!dryRun && targets.length > 0) {
      let keeper: HashDocument | null = null;
      try {
        keeper = parseHashDocument(await client.get(`hash:plaintext:${dup.keepPlaintext}`));
      } catch (error: unknown) {
        console.error(`  Error loading ${dup.keepPlaintext}:`, error);
      }

      if (keeper === null) {
        // Deleting now could remove the last remaining copy.
        console.error(`  Skipping group: entry to keep (${dup.keepPlaintext}) is missing or unreadable`);
        errors += targets.length;
      } else {
        for (const plaintext of targets) {
          try {
            const outcome = await deleteDuplicate(client, plaintext, dup.keepPlaintext, keeper);
            if (outcome === 'deleted') {
              deleted++;
            } else if (outcome === 'failed') {
              errors++;
            }
          } catch (error: unknown) {
            console.error(`  Error deleting ${plaintext}:`, error);
            errors++;
          }
        }
      }
    }

    console.log('');
  }

  return { deleted, errors };
}

/**
 * Script entry point: parses arguments, connects to Redis, reports duplicate
 * groups for the selected fields and removes them when `--execute` is given.
 * Exits with status 1 on an invalid field or an unexpected error, and sets a
 * non-zero exit code when some deletions failed.
 */
async function main(): Promise<void> {
  const parsed = parseArgs(process.argv.slice(2));

  if (parsed.showHelp) {
    showHelp();
    process.exit(0);
  }

  // Validate the field before it is used anywhere.
  let fieldsToCheck: readonly HashField[] = HASH_FIELDS;
  if (parsed.field !== null) {
    if (!isHashField(parsed.field)) {
      console.error(`❌ Invalid field: ${parsed.field}`);
      console.error(`   Valid fields: ${HASH_FIELDS.join(', ')}`);
      process.exit(1);
    }
    fieldsToCheck = [parsed.field];
  }

  const client = new Redis({
    host: REDIS_HOST,
    port: REDIS_PORT,
    password: REDIS_PASSWORD,
    db: REDIS_DB,
  });

  console.log('');
  console.log('🔍 Hasher Duplicate Remover');
  console.log('━'.repeat(42));
  console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT}`);
  console.log(`Mode: ${parsed.dryRun ? 'DRY RUN' : 'EXECUTE'}`);
  console.log(`Batch size: ${parsed.batchSize}`);
  console.log(`Fields to check: ${fieldsToCheck.join(', ')}`);
  console.log('');

  try {
    console.log('🔗 Connecting to Redis...');
    await client.ping();
    console.log('✅ Connected successfully\n');

    const allDuplicates: DuplicateGroup[] = [];

    for (const field of fieldsToCheck) {
      const duplicates = await findDuplicatesForField(client, field, parsed.batchSize);
      allDuplicates.push(...duplicates);
      console.log(`   Found ${duplicates.length} duplicate groups for ${field}`);
    }

    console.log('');
    console.log(`📊 Total duplicate groups found: ${allDuplicates.length}`);

    if (allDuplicates.length === 0) {
      console.log('✅ No duplicates found!');
    } else {
      const totalToDelete = allDuplicates.reduce(
        (sum, dup) => sum + dup.deletePlaintexts.length,
        0
      );
      console.log(`   Total documents to delete: ${totalToDelete}`);

      const { deleted, errors } = await removeDuplicates(client, allDuplicates, parsed.dryRun);

      console.log('━'.repeat(42));
      if (!parsed.dryRun) {
        console.log('✅ Removal complete!');
        console.log('');
        console.log('📊 Statistics:');
        console.log(`   Deleted: ${deleted}`);
        console.log(`   Errors: ${errors}`);
        if (errors > 0) {
          // Let callers (CI, shell scripts) detect a partially failed run.
          process.exitCode = 1;
        }
      } else {
        console.log('💡 This was a dry run. Use --execute to actually remove duplicates.');
      }
    }

    await client.quit();
  } catch (error: unknown) {
    console.error('\n\n❌ Error:', error);
    // disconnect() is synchronous; quit() could reject or wait on a dead connection.
    client.disconnect();
    process.exit(1);
  }
}

main().catch((error: unknown) => {
  console.error('\n\n❌ Error:', error);
  process.exit(1);
});