new redis migration

Signed-off-by: ale <ale@manalejandro.com>
2025-12-15 16:35:35 +01:00
commit 3ce64eeb8e
--- a/scripts/remove-duplicates.ts
+++ b/scripts/remove-duplicates.ts
@@ -3,7 +3,7 @@
 /**
 * Hasher Duplicate Remover Script
 * 
- * This script finds and removes duplicate entries from the Elasticsearch index.
+ * This script finds and removes duplicate entries from Redis.
 * It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
 * 
 * Usage:
@@ -13,20 +13,20 @@
 * Options:
 *   --dry-run              Show duplicates without removing them (default)
 *   --execute              Actually remove the duplicates
- *   --batch-size=<number>  Number of items to process in each batch (default: 1000)
 *   --field=<field>        Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
 *   --help, -h             Show this help message
 */

-import { Client } from '@elastic/elasticsearch';
+import Redis from 'ioredis';

-const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
+const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
+const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
+const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
+const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
 const INDEX_NAME = 'hasher';
-const DEFAULT_BATCH_SIZE = 1000;

 interface ParsedArgs {
  dryRun: boolean;
-  batchSize: number;
  field: string | null;
  showHelp: boolean;
 }
@@ -34,15 +34,23 @@ interface ParsedArgs {
 interface DuplicateGroup {
  value: string;
  field: string;
-  documentIds: string[];
-  keepId: string;
-  deleteIds: string[];
+  plaintexts: string[];
+  keepPlaintext: string;
+  deletePlaintexts: string[];
+}
+
+interface HashDocument {
+  plaintext: string;
+  md5: string;
+  sha1: string;
+  sha256: string;
+  sha512: string;
+  created_at: string;
 }

 function parseArgs(args: string[]): ParsedArgs {
  const result: ParsedArgs = {
    dryRun: true,
-    batchSize: DEFAULT_BATCH_SIZE,
    field: null,
    showHelp: false
  };
@@ -56,21 +64,6 @@ function parseArgs(args: string[]): ParsedArgs {
      result.dryRun = true;
    } else if (arg === '--execute') {
      result.dryRun = false;
-    } else if (arg.startsWith('--batch-size=')) {
-      const value = arg.split('=')[1];
-      const parsed = parseInt(value, 10);
-      if (!isNaN(parsed) && parsed > 0) {
-        result.batchSize = parsed;
-      }
-    } else if (arg === '--batch-size') {
-      const nextArg = args[i + 1];
-      if (nextArg && !nextArg.startsWith('-')) {
-        const parsed = parseInt(nextArg, 10);
-        if (!isNaN(parsed) && parsed > 0) {
-          result.batchSize = parsed;
-          i++;
-        }
-      }
    } else if (arg.startsWith('--field=')) {
      result.field = arg.split('=')[1];
    } else if (arg === '--field') {
@@ -96,13 +89,15 @@ Usage:
 Options:
  --dry-run              Show duplicates without removing them (default)
  --execute              Actually remove the duplicates
-  --batch-size=<number>  Number of items to process in each batch (default: 1000)
  --field=<field>        Check duplicates only on this field
                         Valid fields: plaintext, md5, sha1, sha256, sha512
  --help, -h             Show this help message

 Environment Variables:
-  ELASTICSEARCH_NODE     Elasticsearch node URL (default: http://localhost:9200)
+  REDIS_HOST             Redis host (default: localhost)
+  REDIS_PORT             Redis port (default: 6379)
+  REDIS_PASSWORD         Redis password (optional)
+  REDIS_DB               Redis database number (default: 0)

 Examples:
  npx tsx scripts/remove-duplicates.ts                    # Dry run, show all duplicates
@@ -119,106 +114,78 @@ Notes:
 }

 async function findDuplicatesForField(
-  client: Client, 
-  field: string, 
-  batchSize: number
+  client: Redis, 
+  field: string
 ): Promise<DuplicateGroup[]> {
  const duplicates: DuplicateGroup[] = [];
  
-  // Use aggregation to find duplicate values
-  const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
+  console.log(`   Scanning for ${field} duplicates...`);
  
-  // Use composite aggregation to handle large number of duplicates
-  let afterKey: any = undefined;
-  let hasMore = true;
+  // Get all keys for this field type
+  const pattern = field === 'plaintext' 
+    ? 'hash:plaintext:*'
+    : `hash:index:${field}:*`;
  
-  console.log(`   Scanning for duplicates...`);
+  const keys = await client.keys(pattern);
  
-  while (hasMore) {
-    const aggQuery: any = {
-      index: INDEX_NAME,
-      size: 0,
-      aggs: {
-        duplicates: {
-          composite: {
-            size: batchSize,
-            sources: [
-              { value: { terms: { field: fieldToAggregate } } }
-            ],
-            ...(afterKey && { after: afterKey })
-          },
-          aggs: {
-            doc_count_filter: {
-              bucket_selector: {
-                buckets_path: { count: '_count' },
-                script: 'params.count > 1'
-              }
-            }
-          }
-        }
+  // Maps each field value (plaintext or hash) to the plaintexts found under it
+  const valueMap = new Map<string, string[]>();
+  
+  if (field === 'plaintext') {
+    // Redis keys are unique, so every plaintext maps to a single-entry list here;
+    // this branch therefore cannot produce a group with more than one plaintext
+    for (const key of keys) {
+      const plaintext = key.replace('hash:plaintext:', '');
+      if (!valueMap.has(plaintext)) {
+        valueMap.set(plaintext, []);
      }
-    };
-
-    const response = await client.search(aggQuery);
-    const compositeAgg = response.aggregations?.duplicates as any;
-    const buckets = compositeAgg?.buckets || [];
-
-    for (const bucket of buckets) {
-      if (bucket.doc_count > 1) {
-        const value = bucket.key.value;
-        
-        // Use scroll API for large result sets
-        const documentIds: string[] = [];
-        
-        let scrollResponse = await client.search({
-          index: INDEX_NAME,
-          scroll: '1m',
-          size: 1000,
-          query: {
-            term: {
-              [fieldToAggregate]: value
-            }
-          },
-          sort: [
-            { created_at: { order: 'asc' } }
-          ],
-          _source: false
-        });
-
-        while (scrollResponse.hits.hits.length > 0) {
-          documentIds.push(...scrollResponse.hits.hits.map((hit: any) => hit._id));
-          
-          if (!scrollResponse._scroll_id) break;
-          
-          scrollResponse = await client.scroll({
-            scroll_id: scrollResponse._scroll_id,
-            scroll: '1m'
-          });
-        }
-
-        // Clear scroll
-        if (scrollResponse._scroll_id) {
-          await client.clearScroll({ scroll_id: scrollResponse._scroll_id }).catch(() => {});
-        }
-        
-        if (documentIds.length > 1) {
-          duplicates.push({
-            value: String(value),
-            field,
-            documentIds,
-            keepId: documentIds[0], // Keep the oldest
-            deleteIds: documentIds.slice(1) // Delete the rest
-          });
+      valueMap.get(plaintext)!.push(plaintext);
+    }
+  } else {
+    // For hash fields, group the plaintext stored at each index key by hash value (NOTE(review): index keys are unique per hash, so confirm a group can ever hold more than one plaintext)
+    for (const key of keys) {
+      const hashValue = key.replace(`hash:index:${field}:`, '');
+      const plaintext = await client.get(key);
+      
+      if (plaintext) {
+        if (!valueMap.has(hashValue)) {
+          valueMap.set(hashValue, []);
        }
+        valueMap.get(hashValue)!.push(plaintext);
      }
    }
-
-    // Check if there are more results
-    afterKey = compositeAgg?.after_key;
-    hasMore = buckets.length === batchSize && afterKey;
+  }
+  
+  // Find groups with duplicates
+  for (const [value, plaintexts] of valueMap) {
+    const uniquePlaintexts = Array.from(new Set(plaintexts));
    
-    if (hasMore) {
-      process.stdout.write(`\r   Found ${duplicates.length} duplicate groups so far...`);
+    if (uniquePlaintexts.length > 1) {
+      // Get documents to compare timestamps
+      const docs: { plaintext: string; doc: HashDocument }[] = [];
+      
+      for (const plaintext of uniquePlaintexts) {
+        const docKey = `hash:plaintext:${plaintext}`;
+        const docData = await client.get(docKey);
+        if (docData) {
+          docs.push({ plaintext, doc: JSON.parse(docData) });
+        }
+      }
+      
+      // Sort by created_at (oldest first)
+      docs.sort((a, b) => 
+        new Date(a.doc.created_at).getTime() - new Date(b.doc.created_at).getTime()
+      );
+      
+      if (docs.length > 1) {
+        duplicates.push({
+          value,
+          field,
+          plaintexts: docs.map(d => d.plaintext),
+          keepPlaintext: docs[0].plaintext,
+          deletePlaintexts: docs.slice(1).map(d => d.plaintext)
+        });
+      }
    }
  }

@@ -226,44 +193,50 @@ async function findDuplicatesForField(
 }

 async function removeDuplicates(parsedArgs: ParsedArgs) {
-  const client = new Client({ node: ELASTICSEARCH_NODE });
+  const client = new Redis({
+    host: REDIS_HOST,
+    port: REDIS_PORT,
+    password: REDIS_PASSWORD,
+    db: REDIS_DB,
+  });
+
  const fields = parsedArgs.field 
    ? [parsedArgs.field] 
-    : ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
+    : ['md5', 'sha1', 'sha256', 'sha512'];

  console.log(`🔍 Hasher Duplicate Remover`);
  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
-  console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
+  console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT} (DB ${REDIS_DB})`);
  console.log(`Index: ${INDEX_NAME}`);
  console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️  EXECUTE (will delete)'}`);
-  console.log(`Batch size: ${parsedArgs.batchSize}`);
  console.log(`Fields to check: ${fields.join(', ')}`);
  console.log('');

  try {
    // Test connection
-    console.log('🔗 Connecting to Elasticsearch...');
-    await client.cluster.health({});
+    console.log('🔗 Connecting to Redis...');
+    await client.ping();
    console.log('✅ Connected successfully\n');

    // Get index stats
-    const countResponse = await client.count({ index: INDEX_NAME });
-    console.log(`📊 Total documents in index: ${countResponse.count}\n`);
+    const stats = await client.hgetall('hash:stats');
+    const totalCount = parseInt(stats.count || '0', 10);
+    console.log(`📊 Total documents in index: ${totalCount}\n`);

    const allDuplicates: DuplicateGroup[] = [];
-    const seenDeleteIds = new Set<string>();
+    const seenPlaintexts = new Set<string>();

    // Find duplicates for each field
    for (const field of fields) {
      console.log(`🔍 Checking duplicates for field: ${field}...`);
-      const fieldDuplicates = await findDuplicatesForField(client, field, parsedArgs.batchSize);
+      const fieldDuplicates = await findDuplicatesForField(client, field);
      
-      // Filter out already seen delete IDs to avoid counting the same document multiple times
+      // Filter out already seen plaintexts
      for (const dup of fieldDuplicates) {
-        const newDeleteIds = dup.deleteIds.filter(id => !seenDeleteIds.has(id));
-        if (newDeleteIds.length > 0) {
-          dup.deleteIds = newDeleteIds;
-          newDeleteIds.forEach(id => seenDeleteIds.add(id));
+        const newDeletePlaintexts = dup.deletePlaintexts.filter(p => !seenPlaintexts.has(p));
+        if (newDeletePlaintexts.length > 0) {
+          dup.deletePlaintexts = newDeletePlaintexts;
+          newDeletePlaintexts.forEach(p => seenPlaintexts.add(p));
          allDuplicates.push(dup);
        }
      }
@@ -271,7 +244,7 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
      console.log(`   Found ${fieldDuplicates.length} duplicate groups for ${field}`);
    }

-    const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
+    const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deletePlaintexts.length, 0);

    console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`📋 Summary:`);
@@ -281,6 +254,7 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {

    if (allDuplicates.length === 0) {
      console.log('✨ No duplicates found! Index is clean.\n');
+      await client.quit();
      return;
    }

@@ -293,8 +267,8 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
        : dup.value;
      console.log(`   Field: ${dup.field}`);
      console.log(`   Value: ${truncatedValue}`);
-      console.log(`   Keep: ${dup.keepId}`);
-      console.log(`   Delete: ${dup.deleteIds.length} document(s)`);
+      console.log(`   Keep: ${dup.keepPlaintext}`);
+      console.log(`   Delete: ${dup.deletePlaintexts.length} document(s)`);
      console.log('');
    }

@@ -307,6 +281,7 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
      console.log(`🔎 DRY RUN - No changes made`);
      console.log(`   Run with --execute to remove ${totalToDelete} duplicate documents`);
      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
+      await client.quit();
      return;
    }

@@ -315,53 +290,61 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {

    let deleted = 0;
    let errors = 0;
-    const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);

-    // Delete in batches
-    for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
-      const batch = deleteIds.slice(i, i + parsedArgs.batchSize);
-      
-      try {
-        const bulkOperations = batch.flatMap(id => [
-          { delete: { _index: INDEX_NAME, _id: id } }
-        ]);
-
-        const bulkResponse = await client.bulk({
-          operations: bulkOperations,
-          refresh: false
-        });
-
-        if (bulkResponse.errors) {
-          const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
-          errors += errorCount;
-          deleted += batch.length - errorCount;
-        } else {
-          deleted += batch.length;
+    for (const dup of allDuplicates) {
+      for (const plaintext of dup.deletePlaintexts) {
+        try {
+          const docKey = `hash:plaintext:${plaintext}`;
+          const docData = await client.get(docKey);
+          
+          if (docData) {
+            const doc: HashDocument = JSON.parse(docData);
+            const pipeline = client.pipeline();
+            
+            // Delete main document
+            pipeline.del(docKey);
+            
+            // Delete all indexes
+            pipeline.del(`hash:index:md5:${doc.md5}`);
+            pipeline.del(`hash:index:sha1:${doc.sha1}`);
+            pipeline.del(`hash:index:sha256:${doc.sha256}`);
+            pipeline.del(`hash:index:sha512:${doc.sha512}`);
+            
+            // Update statistics
+            pipeline.hincrby('hash:stats', 'count', -1);
+            pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);
+            
+            const results = await pipeline.exec();
+            
+            if (results && results.some(([err]) => err !== null)) {
+              errors++;
+            } else {
+              deleted++;
+            }
+          }
+          
+          process.stdout.write(`\r⏳ Progress: ${deleted + errors}/${totalToDelete} - Deleted: ${deleted}, Errors: ${errors}`);
+        } catch (error) {
+          console.error(`\n❌ Error deleting ${plaintext}:`, error);
+          errors++;
        }
-
-        process.stdout.write(`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - Deleted: ${deleted}, Errors: ${errors}`);
-      } catch (error) {
-        console.error(`\n❌ Error deleting batch:`, error);
-        errors += batch.length;
      }
    }

-    // Refresh index
-    console.log('\n\n🔄 Refreshing index...');
-    await client.indices.refresh({ index: INDEX_NAME });
-
    // Get new count
-    const newCountResponse = await client.count({ index: INDEX_NAME });
+    const newStats = await client.hgetall('hash:stats');
+    const newCount = parseInt(newStats.count || '0', 10);

-    console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
+    console.log('\n\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log('✅ Duplicate removal complete!');
    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`Documents deleted: ${deleted}`);
    console.log(`Errors: ${errors}`);
-    console.log(`Previous document count: ${countResponse.count}`);
-    console.log(`New document count: ${newCountResponse.count}`);
+    console.log(`Previous document count: ${totalCount}`);
+    console.log(`New document count: ${newCount}`);
    console.log('');

+    await client.quit();
  } catch (error) {
    console.error('\n❌ Error:', error instanceof Error ? error.message : error);
    process.exit(1);
@@ -386,11 +369,10 @@ if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {

 console.log(`\n🔧 Configuration:`);
 console.log(`   Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
-console.log(`   Batch size: ${parsedArgs.batchSize}`);
 if (parsedArgs.field) {
  console.log(`   Field: ${parsedArgs.field}`);
 } else {
-  console.log(`   Fields: all (plaintext, md5, sha1, sha256, sha512)`);
+  console.log(`   Fields: all (md5, sha1, sha256, sha512)`);
 }
 console.log('');