#!/usr/bin/env node
|
|
|
|
/**
|
|
* Hasher Duplicate Remover Script
|
|
*
|
|
* This script finds and removes duplicate entries from Redis.
|
|
* It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
|
|
*
|
|
* Usage:
|
|
* npx tsx scripts/remove-duplicates.ts [options]
|
|
* npm run remove-duplicates [-- options]
|
|
*
|
|
* Options:
|
|
* --dry-run Show duplicates without removing them (default)
|
|
* --execute Actually remove the duplicates
|
|
* --field=<field> Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
|
|
* --help, -h Show this help message
|
|
*/
|
|
|
|
import Redis from 'ioredis';
|
|
|
|
// Redis connection settings, overridable via environment variables.
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
// Optional password; ioredis treats undefined as "no AUTH".
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
// Logical index name — used only for display in the summary output.
const INDEX_NAME = 'hasher';
|
|
|
|
interface ParsedArgs {
|
|
dryRun: boolean;
|
|
field: string | null;
|
|
showHelp: boolean;
|
|
}
|
|
|
|
interface DuplicateGroup {
|
|
value: string;
|
|
field: string;
|
|
plaintexts: string[];
|
|
keepPlaintext: string;
|
|
deletePlaintexts: string[];
|
|
}
|
|
|
|
interface HashDocument {
|
|
plaintext: string;
|
|
md5: string;
|
|
sha1: string;
|
|
sha256: string;
|
|
sha512: string;
|
|
created_at: string;
|
|
}
|
|
|
|
function parseArgs(args: string[]): ParsedArgs {
|
|
const result: ParsedArgs = {
|
|
dryRun: true,
|
|
field: null,
|
|
showHelp: false
|
|
};
|
|
|
|
for (let i = 0; i < args.length; i++) {
|
|
const arg = args[i];
|
|
|
|
if (arg === '--help' || arg === '-h') {
|
|
result.showHelp = true;
|
|
} else if (arg === '--dry-run') {
|
|
result.dryRun = true;
|
|
} else if (arg === '--execute') {
|
|
result.dryRun = false;
|
|
} else if (arg.startsWith('--field=')) {
|
|
result.field = arg.split('=')[1];
|
|
} else if (arg === '--field') {
|
|
const nextArg = args[i + 1];
|
|
if (nextArg && !nextArg.startsWith('-')) {
|
|
result.field = nextArg;
|
|
i++;
|
|
}
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
 * Print usage information to stdout and terminate the process with exit
 * code 0. This function never returns.
 */
function showHelp() {
  console.log(`
Hasher Duplicate Remover Script

Usage:
  npx tsx scripts/remove-duplicates.ts [options]
  npm run remove-duplicates [-- options]

Options:
  --dry-run          Show duplicates without removing them (default)
  --execute          Actually remove the duplicates
  --field=<field>    Check duplicates only on this field
                     Valid fields: plaintext, md5, sha1, sha256, sha512
  --help, -h         Show this help message

Environment Variables:
  REDIS_HOST         Redis host (default: localhost)
  REDIS_PORT         Redis port (default: 6379)
  REDIS_PASSWORD     Redis password (optional)
  REDIS_DB           Redis database number (default: 0)

Examples:
  npx tsx scripts/remove-duplicates.ts                      # Dry run, show all duplicates
  npx tsx scripts/remove-duplicates.ts --execute            # Remove all duplicates
  npx tsx scripts/remove-duplicates.ts --field=md5          # Check only md5 duplicates
  npx tsx scripts/remove-duplicates.ts --execute --field=plaintext

Notes:
  - The script keeps the OLDEST document (by created_at) and removes newer duplicates
  - Always run with --dry-run first to review what will be deleted
  - Duplicates are checked across all hash fields by default
`);
  process.exit(0);
}
|
|
|
|
async function findDuplicatesForField(
|
|
client: Redis,
|
|
field: string
|
|
): Promise<DuplicateGroup[]> {
|
|
const duplicates: DuplicateGroup[] = [];
|
|
|
|
console.log(` Scanning for ${field} duplicates...`);
|
|
|
|
// Get all keys for this field type
|
|
const pattern = field === 'plaintext'
|
|
? 'hash:plaintext:*'
|
|
: `hash:index:${field}:*`;
|
|
|
|
const keys = await client.keys(pattern);
|
|
|
|
// For hash indexes, group by hash value (not plaintext)
|
|
const valueMap = new Map<string, string[]>();
|
|
|
|
if (field === 'plaintext') {
|
|
// Each key is already unique for plaintext
|
|
// Check for same plaintext with different created_at
|
|
for (const key of keys) {
|
|
const plaintext = key.replace('hash:plaintext:', '');
|
|
if (!valueMap.has(plaintext)) {
|
|
valueMap.set(plaintext, []);
|
|
}
|
|
valueMap.get(plaintext)!.push(plaintext);
|
|
}
|
|
} else {
|
|
// For hash fields, get the plaintext and check if multiple plaintexts have same hash
|
|
for (const key of keys) {
|
|
const hashValue = key.replace(`hash:index:${field}:`, '');
|
|
const plaintext = await client.get(key);
|
|
|
|
if (plaintext) {
|
|
if (!valueMap.has(hashValue)) {
|
|
valueMap.set(hashValue, []);
|
|
}
|
|
valueMap.get(hashValue)!.push(plaintext);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Find groups with duplicates
|
|
for (const [value, plaintexts] of valueMap) {
|
|
const uniquePlaintexts = Array.from(new Set(plaintexts));
|
|
|
|
if (uniquePlaintexts.length > 1) {
|
|
// Get documents to compare timestamps
|
|
const docs: { plaintext: string; doc: HashDocument }[] = [];
|
|
|
|
for (const plaintext of uniquePlaintexts) {
|
|
const docKey = `hash:plaintext:${plaintext}`;
|
|
const docData = await client.get(docKey);
|
|
if (docData) {
|
|
docs.push({ plaintext, doc: JSON.parse(docData) });
|
|
}
|
|
}
|
|
|
|
// Sort by created_at (oldest first)
|
|
docs.sort((a, b) =>
|
|
new Date(a.doc.created_at).getTime() - new Date(b.doc.created_at).getTime()
|
|
);
|
|
|
|
if (docs.length > 1) {
|
|
duplicates.push({
|
|
value,
|
|
field,
|
|
plaintexts: docs.map(d => d.plaintext),
|
|
keepPlaintext: docs[0].plaintext,
|
|
deletePlaintexts: docs.slice(1).map(d => d.plaintext)
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
return duplicates;
|
|
}
|
|
|
|
async function removeDuplicates(parsedArgs: ParsedArgs) {
|
|
const client = new Redis({
|
|
host: REDIS_HOST,
|
|
port: REDIS_PORT,
|
|
password: REDIS_PASSWORD,
|
|
db: REDIS_DB,
|
|
});
|
|
|
|
const fields = parsedArgs.field
|
|
? [parsedArgs.field]
|
|
: ['md5', 'sha1', 'sha256', 'sha512'];
|
|
|
|
console.log(`🔍 Hasher Duplicate Remover`);
|
|
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT} (DB ${REDIS_DB})`);
|
|
console.log(`Index: ${INDEX_NAME}`);
|
|
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
|
|
console.log(`Fields to check: ${fields.join(', ')}`);
|
|
console.log('');
|
|
|
|
try {
|
|
// Test connection
|
|
console.log('🔗 Connecting to Redis...');
|
|
await client.ping();
|
|
console.log('✅ Connected successfully\n');
|
|
|
|
// Get index stats
|
|
const stats = await client.hgetall('hash:stats');
|
|
const totalCount = parseInt(stats.count || '0', 10);
|
|
console.log(`📊 Total documents in index: ${totalCount}\n`);
|
|
|
|
const allDuplicates: DuplicateGroup[] = [];
|
|
const seenPlaintexts = new Set<string>();
|
|
|
|
// Find duplicates for each field
|
|
for (const field of fields) {
|
|
console.log(`🔍 Checking duplicates for field: ${field}...`);
|
|
const fieldDuplicates = await findDuplicatesForField(client, field);
|
|
|
|
// Filter out already seen plaintexts
|
|
for (const dup of fieldDuplicates) {
|
|
const newDeletePlaintexts = dup.deletePlaintexts.filter(p => !seenPlaintexts.has(p));
|
|
if (newDeletePlaintexts.length > 0) {
|
|
dup.deletePlaintexts = newDeletePlaintexts;
|
|
newDeletePlaintexts.forEach(p => seenPlaintexts.add(p));
|
|
allDuplicates.push(dup);
|
|
}
|
|
}
|
|
|
|
console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
|
|
}
|
|
|
|
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deletePlaintexts.length, 0);
|
|
|
|
console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
console.log(`📋 Summary:`);
|
|
console.log(` Duplicate groups found: ${allDuplicates.length}`);
|
|
console.log(` Documents to delete: ${totalToDelete}`);
|
|
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
|
|
|
if (allDuplicates.length === 0) {
|
|
console.log('✨ No duplicates found! Index is clean.\n');
|
|
await client.quit();
|
|
return;
|
|
}
|
|
|
|
// Show sample of duplicates
|
|
console.log(`📝 Sample duplicates (showing first 10):\n`);
|
|
const samplesToShow = allDuplicates.slice(0, 10);
|
|
for (const dup of samplesToShow) {
|
|
const truncatedValue = dup.value.length > 50
|
|
? dup.value.substring(0, 50) + '...'
|
|
: dup.value;
|
|
console.log(` Field: ${dup.field}`);
|
|
console.log(` Value: ${truncatedValue}`);
|
|
console.log(` Keep: ${dup.keepPlaintext}`);
|
|
console.log(` Delete: ${dup.deletePlaintexts.length} document(s)`);
|
|
console.log('');
|
|
}
|
|
|
|
if (allDuplicates.length > 10) {
|
|
console.log(` ... and ${allDuplicates.length - 10} more duplicate groups\n`);
|
|
}
|
|
|
|
if (parsedArgs.dryRun) {
|
|
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
console.log(`🔎 DRY RUN - No changes made`);
|
|
console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
|
|
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
|
await client.quit();
|
|
return;
|
|
}
|
|
|
|
// Execute deletion
|
|
console.log(`\n🗑️ Removing ${totalToDelete} duplicate documents...\n`);
|
|
|
|
let deleted = 0;
|
|
let errors = 0;
|
|
|
|
for (const dup of allDuplicates) {
|
|
for (const plaintext of dup.deletePlaintexts) {
|
|
try {
|
|
const docKey = `hash:plaintext:${plaintext}`;
|
|
const docData = await client.get(docKey);
|
|
|
|
if (docData) {
|
|
const doc: HashDocument = JSON.parse(docData);
|
|
const pipeline = client.pipeline();
|
|
|
|
// Delete main document
|
|
pipeline.del(docKey);
|
|
|
|
// Delete all indexes
|
|
pipeline.del(`hash:index:md5:${doc.md5}`);
|
|
pipeline.del(`hash:index:sha1:${doc.sha1}`);
|
|
pipeline.del(`hash:index:sha256:${doc.sha256}`);
|
|
pipeline.del(`hash:index:sha512:${doc.sha512}`);
|
|
|
|
// Update statistics
|
|
pipeline.hincrby('hash:stats', 'count', -1);
|
|
pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);
|
|
|
|
const results = await pipeline.exec();
|
|
|
|
if (results && results.some(([err]) => err !== null)) {
|
|
errors++;
|
|
} else {
|
|
deleted++;
|
|
}
|
|
}
|
|
|
|
process.stdout.write(`\r⏳ Progress: ${deleted + errors}/${totalToDelete} - Deleted: ${deleted}, Errors: ${errors}`);
|
|
} catch (error) {
|
|
console.error(`\n❌ Error deleting ${plaintext}:`, error);
|
|
errors++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Get new count
|
|
const newStats = await client.hgetall('hash:stats');
|
|
const newCount = parseInt(newStats.count || '0', 10);
|
|
|
|
console.log('\n\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
|
|
console.log('✅ Duplicate removal complete!');
|
|
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
console.log(`Documents deleted: ${deleted}`);
|
|
console.log(`Errors: ${errors}`);
|
|
console.log(`Previous document count: ${totalCount}`);
|
|
console.log(`New document count: ${newCount}`);
|
|
console.log('');
|
|
|
|
await client.quit();
|
|
} catch (error) {
|
|
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
|
|
process.exit(1);
|
|
}
|
|
}
|
|
|
|
// Parse command line arguments
|
|
const args = process.argv.slice(2);
|
|
const parsedArgs = parseArgs(args);
|
|
|
|
if (parsedArgs.showHelp) {
|
|
showHelp();
|
|
}
|
|
|
|
// Validate field if provided
|
|
const validFields = ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
|
if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
|
|
console.error(`❌ Invalid field: ${parsedArgs.field}`);
|
|
console.error(` Valid fields: ${validFields.join(', ')}`);
|
|
process.exit(1);
|
|
}
|
|
|
|
console.log(`\n🔧 Configuration:`);
|
|
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
|
|
if (parsedArgs.field) {
|
|
console.log(` Field: ${parsedArgs.field}`);
|
|
} else {
|
|
console.log(` Fields: all (md5, sha1, sha256, sha512)`);
|
|
}
|
|
console.log('');
|
|
|
|
removeDuplicates(parsedArgs).catch(console.error);
|