remove-duplicates

Signed-off-by: ale <ale@manalejandro.com>
2025-12-08 20:56:04 +01:00
commit 459cdcd9bc
--- a/scripts/remove-duplicates.ts
+++ b/scripts/remove-duplicates.ts
@@ -0,0 +1,350 @@
+#!/usr/bin/env node
+
+/**
+ * Hasher Duplicate Remover Script
+ * 
+ * This script finds and removes duplicate entries from the Elasticsearch index.
+ * It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
+ * 
+ * Usage:
+ *   npx tsx scripts/remove-duplicates.ts [options]
+ *   npm run remove-duplicates [-- options]
+ * 
+ * Options:
+ *   --dry-run              Show duplicates without removing them (default)
+ *   --execute              Actually remove the duplicates
+ *   --batch-size=<number>  Number of items to process in each batch (default: 1000)
+ *   --field=<field>        Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
+ *   --help, -h             Show this help message
+ */
+
+import { Client } from '@elastic/elasticsearch';
+
+// Elasticsearch endpoint; taken from the ELASTICSEARCH_NODE environment variable when set.
+const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
+// Name of the index that is scanned for duplicates.
+const INDEX_NAME = 'hasher';
+// Batch size used when --batch-size is absent or not a positive integer.
+const DEFAULT_BATCH_SIZE = 1000;
+
+/** Command-line options after parsing (produced by parseArgs). */
+interface ParsedArgs {
+  /** When true nothing is deleted. Defaults to true; --execute sets it to false. */
+  dryRun: boolean;
+  /** Size of each bulk-delete batch; also passed as the terms-aggregation `size`. */
+  batchSize: number;
+  /** Single field to check, or null to check every supported field. */
+  field: string | null;
+  /** Set by --help / -h. */
+  showHelp: boolean;
+}
+
+/** Documents that share one value of one field, split into a keeper and the rest. */
+interface DuplicateGroup {
+  /** The shared field value (stringified aggregation bucket key). */
+  value: string;
+  /** Logical field name the group was found on (e.g. 'md5'). */
+  field: string;
+  /** IDs of the matching documents, as returned by a search sorted ascending by created_at. */
+  documentIds: string[];
+  /** ID of the document that is kept. */
+  keepId: string;
+  /** IDs of the documents scheduled for deletion. */
+  deleteIds: string[];
+}
+
+/**
+ * Turns the raw command-line tokens into a ParsedArgs object.
+ *
+ * Both `--flag=value` and `--flag value` spellings are accepted for
+ * --batch-size and --field. Unknown tokens and invalid batch sizes are
+ * ignored, leaving the corresponding default in place.
+ *
+ * @param args Command-line tokens (without the node binary and script path).
+ * @returns The effective options; dry-run is the default mode.
+ */
+function parseArgs(args: string[]): ParsedArgs {
+  const options: ParsedArgs = {
+    dryRun: true,
+    batchSize: DEFAULT_BATCH_SIZE,
+    field: null,
+    showHelp: false
+  };
+
+  // Parses a base-10 integer and returns it only when it is strictly positive.
+  const toPositiveInt = (raw: string): number | null => {
+    const candidate = parseInt(raw, 10);
+    return !isNaN(candidate) && candidate > 0 ? candidate : null;
+  };
+
+  for (let index = 0; index < args.length; index += 1) {
+    const current = args[index];
+
+    if (current === '--help' || current === '-h') {
+      options.showHelp = true;
+      continue;
+    }
+
+    if (current === '--dry-run' || current === '--execute') {
+      options.dryRun = current === '--dry-run';
+      continue;
+    }
+
+    // Inline "--flag=value" spellings.
+    if (current.startsWith('--batch-size=')) {
+      const size = toPositiveInt(current.split('=')[1]);
+      if (size !== null) {
+        options.batchSize = size;
+      }
+      continue;
+    }
+
+    if (current.startsWith('--field=')) {
+      options.field = current.split('=')[1];
+      continue;
+    }
+
+    // Everything below handles the "--flag value" spellings only.
+    if (current !== '--batch-size' && current !== '--field') {
+      continue;
+    }
+
+    // The value must exist and must not look like another flag.
+    const following = args[index + 1];
+    if (!following || following.startsWith('-')) {
+      continue;
+    }
+
+    if (current === '--field') {
+      options.field = following;
+      index += 1;
+    } else {
+      const size = toPositiveInt(following);
+      if (size !== null) {
+        options.batchSize = size;
+        // The value token is consumed only when it was a valid size.
+        index += 1;
+      }
+    }
+  }
+
+  return options;
+}
+
+/**
+ * Prints the usage text to stdout and terminates the process with exit code 0.
+ */
+function showHelp() {
+  const helpText = `
+Hasher Duplicate Remover Script
+
+Usage:
+  npx tsx scripts/remove-duplicates.ts [options]
+  npm run remove-duplicates [-- options]
+
+Options:
+  --dry-run              Show duplicates without removing them (default)
+  --execute              Actually remove the duplicates
+  --batch-size=<number>  Number of items to process in each batch (default: 1000)
+  --field=<field>        Check duplicates only on this field
+                         Valid fields: plaintext, md5, sha1, sha256, sha512
+  --help, -h             Show this help message
+
+Environment Variables:
+  ELASTICSEARCH_NODE     Elasticsearch node URL (default: http://localhost:9200)
+
+Examples:
+  npx tsx scripts/remove-duplicates.ts                    # Dry run, show all duplicates
+  npx tsx scripts/remove-duplicates.ts --execute          # Remove all duplicates
+  npx tsx scripts/remove-duplicates.ts --field=md5        # Check only md5 duplicates
+  npx tsx scripts/remove-duplicates.ts --execute --field=plaintext
+
+Notes:
+  - The script keeps the OLDEST document (by created_at) and removes newer duplicates
+  - Always run with --dry-run first to review what will be deleted
+  - Duplicates are checked across all hash fields by default
+`;
+
+  console.log(helpText);
+  process.exit(0);
+}
+
+/**
+ * Finds groups of documents in the index that share the same value of `field`.
+ *
+ * A terms aggregation (min_doc_count 2) lists the duplicated values; for each
+ * value the matching documents are then fetched oldest-first so that the first
+ * hit becomes the keeper and the remaining hits the deletion candidates.
+ *
+ * Limits:
+ *   - At most `batchSize` duplicated values are returned per call (the terms
+ *     aggregation `size`). A warning is printed when that limit is reached.
+ *   - At most 10,000 documents are fetched per value, because a search that
+ *     asks for more hits than the default index.max_result_window is rejected.
+ *     The oldest document is still first, so the keeper is unaffected; any
+ *     surplus duplicates are picked up by a later run.
+ *   - Documents with identical created_at values come back in no guaranteed
+ *     order.
+ *
+ * @param client    Connected Elasticsearch client.
+ * @param field     Logical field name; 'plaintext' is queried via 'plaintext.keyword'.
+ * @param batchSize Maximum number of duplicated values to inspect.
+ * @returns One DuplicateGroup per duplicated value that still has more than one document.
+ */
+async function findDuplicatesForField(
+  client: Client, 
+  field: string, 
+  batchSize: number
+): Promise<DuplicateGroup[]> {
+  // Default value of index.max_result_window.
+  const MAX_HITS_PER_SEARCH = 10000;
+
+  interface TermsBucket {
+    key: string | number;
+    doc_count: number;
+  }
+
+  const duplicates: DuplicateGroup[] = [];
+
+  // Aggregations need the keyword sub-field for the text field.
+  const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
+
+  const response = await client.search({
+    index: INDEX_NAME,
+    size: 0,
+    aggs: {
+      duplicates: {
+        terms: {
+          field: fieldToAggregate,
+          min_doc_count: 2,
+          size: batchSize
+        }
+      }
+    }
+  });
+
+  // The client types aggregations as a wide union; only the terms buckets are read here.
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const buckets: TermsBucket[] = (response.aggregations?.duplicates as any)?.buckets ?? [];
+
+  if (buckets.length >= batchSize) {
+    console.warn(
+      `   ⚠️  ${field}: reached the limit of ${batchSize} duplicate values; ` +
+      `more may exist (increase --batch-size or re-run after --execute)`
+    );
+  }
+
+  for (const bucket of buckets) {
+    const value = bucket.key;
+
+    // Oldest first, so the keeper is always part of the (possibly capped) page.
+    const docsResponse = await client.search({
+      index: INDEX_NAME,
+      size: Math.min(bucket.doc_count, MAX_HITS_PER_SEARCH),
+      query: {
+        term: {
+          [fieldToAggregate]: value
+        }
+      },
+      sort: [
+        { created_at: { order: 'asc' } }
+      ],
+      // Only the document IDs are needed.
+      _source: false
+    });
+
+    const documentIds = docsResponse.hits.hits
+      .map(hit => hit._id)
+      .filter((id): id is string => typeof id === 'string');
+
+    // The index may have changed since the aggregation ran; skip values that
+    // no longer have more than one document.
+    if (documentIds.length > 1) {
+      duplicates.push({
+        value: String(value),
+        field,
+        documentIds,
+        keepId: documentIds[0], // Keep the oldest
+        deleteIds: documentIds.slice(1) // Delete the rest
+      });
+    }
+  }
+
+  return duplicates;
+}
+
+/**
+ * Reconciles one duplicate group with the keep/delete decisions already taken
+ * for earlier groups, so that a document kept by one group is never deleted by
+ * another (which could otherwise remove every copy of a value when created_at
+ * values tie or when the groups of different fields overlap only partially).
+ *
+ * The keeper is, in order of preference, a member that an earlier group already
+ * keeps, or the oldest member not yet scheduled for deletion. `dup.keepId` and
+ * `dup.deleteIds` are rewritten accordingly and both sets are updated.
+ *
+ * @param dup           Group to reconcile; mutated in place.
+ * @param keptIds       IDs that earlier groups decided to keep.
+ * @param seenDeleteIds IDs that earlier groups already scheduled for deletion.
+ * @returns true when the group contributes at least one new document to delete.
+ */
+function reconcileDuplicateGroup(
+  dup: DuplicateGroup,
+  keptIds: Set<string>,
+  seenDeleteIds: Set<string>
+): boolean {
+  const keeper =
+    dup.documentIds.find(id => keptIds.has(id)) ??
+    dup.documentIds.find(id => !seenDeleteIds.has(id));
+
+  if (keeper === undefined) {
+    // Every member is already scheduled for deletion; nothing new to decide.
+    return false;
+  }
+
+  keptIds.add(keeper);
+  dup.keepId = keeper;
+  dup.deleteIds = dup.documentIds.filter(
+    id => id !== keeper && !keptIds.has(id) && !seenDeleteIds.has(id)
+  );
+  dup.deleteIds.forEach(id => seenDeleteIds.add(id));
+
+  return dup.deleteIds.length > 0;
+}
+
+/**
+ * Finds duplicate documents for the requested field(s), prints a summary and,
+ * unless running in dry-run mode, deletes the duplicates with bulk requests.
+ *
+ * Any error raised after the connection attempt is printed and terminates the
+ * process with exit code 1. The client is closed when the function returns.
+ *
+ * @param parsedArgs Options produced by parseArgs.
+ */
+async function removeDuplicates(parsedArgs: ParsedArgs) {
+  const client = new Client({ node: ELASTICSEARCH_NODE });
+  const fields = parsedArgs.field 
+    ? [parsedArgs.field] 
+    : ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
+
+  console.log(`🔍 Hasher Duplicate Remover`);
+  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
+  console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
+  console.log(`Index: ${INDEX_NAME}`);
+  console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️  EXECUTE (will delete)'}`);
+  console.log(`Batch size: ${parsedArgs.batchSize}`);
+  console.log(`Fields to check: ${fields.join(', ')}`);
+  console.log('');
+
+  try {
+    // Test connection
+    console.log('🔗 Connecting to Elasticsearch...');
+    await client.cluster.health({});
+    console.log('✅ Connected successfully\n');
+
+    // Get index stats
+    const countResponse = await client.count({ index: INDEX_NAME });
+    console.log(`📊 Total documents in index: ${countResponse.count}\n`);
+
+    const allDuplicates: DuplicateGroup[] = [];
+    const seenDeleteIds = new Set<string>();
+    // Documents some group decided to keep; these must never be deleted.
+    const keptIds = new Set<string>();
+
+    // Find duplicates for each field
+    for (const field of fields) {
+      console.log(`🔍 Checking duplicates for field: ${field}...`);
+      const fieldDuplicates = await findDuplicatesForField(client, field, parsedArgs.batchSize);
+
+      // Drop IDs already handled by earlier groups and protect earlier keepers,
+      // so that no document is counted twice and no kept document is deleted.
+      for (const dup of fieldDuplicates) {
+        if (reconcileDuplicateGroup(dup, keptIds, seenDeleteIds)) {
+          allDuplicates.push(dup);
+        }
+      }
+
+      console.log(`   Found ${fieldDuplicates.length} duplicate groups for ${field}`);
+    }
+
+    const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
+
+    console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
+    console.log(`📋 Summary:`);
+    console.log(`   Duplicate groups found: ${allDuplicates.length}`);
+    console.log(`   Documents to delete: ${totalToDelete}`);
+    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
+
+    if (allDuplicates.length === 0) {
+      console.log('✨ No duplicates found! Index is clean.\n');
+      return;
+    }
+
+    // Show sample of duplicates
+    console.log(`📝 Sample duplicates (showing first 10):\n`);
+    const samplesToShow = allDuplicates.slice(0, 10);
+    for (const dup of samplesToShow) {
+      const truncatedValue = dup.value.length > 50 
+        ? dup.value.substring(0, 50) + '...' 
+        : dup.value;
+      console.log(`   Field: ${dup.field}`);
+      console.log(`   Value: ${truncatedValue}`);
+      console.log(`   Keep: ${dup.keepId}`);
+      console.log(`   Delete: ${dup.deleteIds.length} document(s)`);
+      console.log('');
+    }
+
+    if (allDuplicates.length > 10) {
+      console.log(`   ... and ${allDuplicates.length - 10} more duplicate groups\n`);
+    }
+
+    if (parsedArgs.dryRun) {
+      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
+      console.log(`🔎 DRY RUN - No changes made`);
+      console.log(`   Run with --execute to remove ${totalToDelete} duplicate documents`);
+      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
+      return;
+    }
+
+    // Execute deletion
+    console.log(`\n🗑️  Removing ${totalToDelete} duplicate documents...\n`);
+
+    let deleted = 0;
+    let errors = 0;
+    const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
+
+    // Delete in batches
+    for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
+      const batch = deleteIds.slice(i, i + parsedArgs.batchSize);
+
+      try {
+        const bulkOperations = batch.map(id => (
+          { delete: { _index: INDEX_NAME, _id: id } }
+        ));
+
+        // The index is refreshed once after all batches instead of per request.
+        const bulkResponse = await client.bulk({
+          operations: bulkOperations,
+          refresh: false
+        });
+
+        if (bulkResponse.errors) {
+          const errorCount = bulkResponse.items.filter(item => Boolean(item.delete?.error)).length;
+          errors += errorCount;
+          deleted += batch.length - errorCount;
+        } else {
+          deleted += batch.length;
+        }
+
+        process.stdout.write(`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - Deleted: ${deleted}, Errors: ${errors}`);
+      } catch (error) {
+        // A failed request counts the whole batch as errors; later batches still run.
+        console.error(`\n❌ Error deleting batch:`, error);
+        errors += batch.length;
+      }
+    }
+
+    // Refresh index
+    console.log('\n\n🔄 Refreshing index...');
+    await client.indices.refresh({ index: INDEX_NAME });
+
+    // Get new count
+    const newCountResponse = await client.count({ index: INDEX_NAME });
+
+    console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
+    console.log('✅ Duplicate removal complete!');
+    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
+    console.log(`Documents deleted: ${deleted}`);
+    console.log(`Errors: ${errors}`);
+    console.log(`Previous document count: ${countResponse.count}`);
+    console.log(`New document count: ${newCountResponse.count}`);
+    console.log('');
+
+  } catch (error) {
+    console.error('\n❌ Error:', error instanceof Error ? error.message : error);
+    process.exit(1);
+  } finally {
+    // Reached on every non-error return path; releases the client's connections.
+    await client.close();
+  }
+}
+
+// Entry point: parse the options, validate them, echo the configuration and run.
+const args = process.argv.slice(2);
+const parsedArgs = parseArgs(args);
+
+// showHelp() terminates the process, so nothing below runs for --help.
+if (parsedArgs.showHelp) {
+  showHelp();
+}
+
+// Validate field if provided
+const validFields = ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
+if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
+  console.error(`❌ Invalid field: ${parsedArgs.field}`);
+  console.error(`   Valid fields: ${validFields.join(', ')}`);
+  process.exit(1);
+}
+
+console.log(`\n🔧 Configuration:`);
+console.log(`   Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
+console.log(`   Batch size: ${parsedArgs.batchSize}`);
+if (parsedArgs.field) {
+  console.log(`   Field: ${parsedArgs.field}`);
+} else {
+  console.log(`   Fields: all (plaintext, md5, sha1, sha256, sha512)`);
+}
+console.log('');
+
+// A rejection here (e.g. the client constructor throwing on an invalid node
+// URL) must not leave the process with a success exit status.
+removeDuplicates(parsedArgs).catch((error: unknown) => {
+  console.error(error);
+  process.exitCode = 1;
+});