From 459cdcd9bce5aa9bcc345a7d734df4d3cdf1faa2 Mon Sep 17 00:00:00 2001
From: ale
Date: Mon, 8 Dec 2025 20:56:04 +0100
Subject: [PATCH] remove-duplicates

Signed-off-by: ale
---
 package.json                 |   3 +-
 scripts/remove-duplicates.ts | 350 +++++++++++++++++++++++++++++++++++
 2 files changed, 352 insertions(+), 1 deletion(-)
 create mode 100644 scripts/remove-duplicates.ts

diff --git a/package.json b/package.json
index a138ff6..6e5d52b 100644
--- a/package.json
+++ b/package.json
@@ -34,7 +34,8 @@
     "build": "next build",
     "start": "next start",
     "lint": "eslint",
-    "index-file": "tsx scripts/index-file.ts"
+    "index-file": "tsx scripts/index-file.ts",
+    "remove-duplicates": "tsx scripts/remove-duplicates.ts"
   },
   "dependencies": {
     "@elastic/elasticsearch": "^9.2.0",
diff --git a/scripts/remove-duplicates.ts b/scripts/remove-duplicates.ts
new file mode 100644
index 0000000..2f387fd
--- /dev/null
+++ b/scripts/remove-duplicates.ts
@@ -0,0 +1,350 @@
+#!/usr/bin/env node
+
+/**
+ * Hasher Duplicate Remover Script
+ *
+ * This script finds and removes duplicate entries from the Elasticsearch index.
+ * It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
+ *
+ * Usage:
+ *   npx tsx scripts/remove-duplicates.ts [options]
+ *   npm run remove-duplicates [-- options]
+ *
+ * Options:
+ *   --dry-run            Show duplicates without removing them (default)
+ *   --execute            Actually remove the duplicates
+ *   --batch-size=<n>     Number of items to process in each batch (default: 1000)
+ *   --field=<field>      Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
+ *   --help, -h           Show this help message
+ */
+
+import { Client } from '@elastic/elasticsearch';
+
+const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
+const INDEX_NAME = 'hasher';
+const DEFAULT_BATCH_SIZE = 1000;
+
+interface ParsedArgs {
+  dryRun: boolean;
+  batchSize: number;
+  field: string | null;
+  showHelp: boolean;
+}
+
+interface DuplicateGroup {
+  value: string;
+  field: string;
+  documentIds: string[];
+  keepId: string;
+  deleteIds: string[];
+}
+
+function parseArgs(args: string[]): ParsedArgs {
+  const result: ParsedArgs = {
+    dryRun: true,
+    batchSize: DEFAULT_BATCH_SIZE,
+    field: null,
+    showHelp: false
+  };
+
+  for (let i = 0; i < args.length; i++) {
+    const arg = args[i];
+
+    if (arg === '--help' || arg === '-h') {
+      result.showHelp = true;
+    } else if (arg === '--dry-run') {
+      result.dryRun = true;
+    } else if (arg === '--execute') {
+      result.dryRun = false;
+    } else if (arg.startsWith('--batch-size=')) {
+      const value = arg.split('=')[1];
+      const parsed = parseInt(value, 10);
+      if (!isNaN(parsed) && parsed > 0) {
+        result.batchSize = parsed;
+      }
+    } else if (arg === '--batch-size') {
+      const nextArg = args[i + 1];
+      if (nextArg && !nextArg.startsWith('-')) {
+        const parsed = parseInt(nextArg, 10);
+        if (!isNaN(parsed) && parsed > 0) {
+          result.batchSize = parsed;
+          i++;
+        }
+      }
+    } else if (arg.startsWith('--field=')) {
+      result.field = arg.split('=')[1];
+    } else if (arg === '--field') {
+      const nextArg = args[i + 1];
+      if (nextArg && !nextArg.startsWith('-')) {
+        result.field = nextArg;
+        i++;
+      }
+    }
+  }
+
+  return result;
+}
+
+function showHelp() {
+  console.log(`
+Hasher Duplicate Remover Script
+
+Usage:
+  npx tsx scripts/remove-duplicates.ts [options]
+  npm run remove-duplicates [-- options]
+
+Options:
+  --dry-run            Show duplicates without removing them (default)
+  --execute            Actually remove the duplicates
+  --batch-size=<n>     Number of items to process in each batch (default: 1000)
+  --field=<field>      Check duplicates only on this field
+                       Valid fields: plaintext, md5, sha1, sha256, sha512
+  --help, -h           Show this help message
+
+Environment Variables:
+  ELASTICSEARCH_NODE   Elasticsearch node URL (default: http://localhost:9200)
+
+Examples:
+  npx tsx scripts/remove-duplicates.ts                     # Dry run, show all duplicates
+  npx tsx scripts/remove-duplicates.ts --execute           # Remove all duplicates
+  npx tsx scripts/remove-duplicates.ts --field=md5         # Check only md5 duplicates
+  npx tsx scripts/remove-duplicates.ts --execute --field=plaintext
+
+Notes:
+  - The script keeps the OLDEST document (by created_at) and removes newer duplicates
+  - Always run with --dry-run first to review what will be deleted
+  - Duplicates are checked across all hash fields by default
+`);
+  process.exit(0);
+}
+
+async function findDuplicatesForField(
+  client: Client,
+  field: string,
+  batchSize: number
+): Promise<DuplicateGroup[]> {
+  const duplicates: DuplicateGroup[] = [];
+
+  // Use aggregation to find duplicate values
+  const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
+
+  const response = await client.search({
+    index: INDEX_NAME,
+    size: 0,
+    aggs: {
+      duplicates: {
+        terms: {
+          field: fieldToAggregate,
+          min_doc_count: 2,
+          size: batchSize
+        }
+      }
+    }
+  });
+
+  const buckets = (response.aggregations?.duplicates as any)?.buckets || [];
+
+  for (const bucket of buckets) {
+    const value = bucket.key;
+
+    // Get all documents with this value, sorted by created_at
+    const docsResponse = await client.search({
+      index: INDEX_NAME,
+      size: bucket.doc_count,
+      query: {
+        term: {
+          [fieldToAggregate]: value
+        }
+      },
+      sort: [
+        { created_at: { order: 'asc' } }
+      ],
+      _source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512', 'created_at']
+    });
+
+    const documentIds = docsResponse.hits.hits.map((hit: any) => hit._id);
+
+    if (documentIds.length > 1) {
+      duplicates.push({
+        value: String(value),
+        field,
+        documentIds,
+        keepId: documentIds[0], // Keep the oldest
+        deleteIds: documentIds.slice(1) // Delete the rest
+      });
+    }
+  }
+
+  return duplicates;
+}
+
+async function removeDuplicates(parsedArgs: ParsedArgs) {
+  const client = new Client({ node: ELASTICSEARCH_NODE });
+  const fields = parsedArgs.field
+    ? [parsedArgs.field]
+    : ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
+
+  console.log(`🔍 Hasher Duplicate Remover`);
+  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
+  console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
+  console.log(`Index: ${INDEX_NAME}`);
+  console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
+  console.log(`Batch size: ${parsedArgs.batchSize}`);
+  console.log(`Fields to check: ${fields.join(', ')}`);
+  console.log('');
+
+  try {
+    // Test connection
+    console.log('🔗 Connecting to Elasticsearch...');
+    await client.cluster.health({});
+    console.log('✅ Connected successfully\n');
+
+    // Get index stats
+    const countResponse = await client.count({ index: INDEX_NAME });
+    console.log(`📊 Total documents in index: ${countResponse.count}\n`);
+
+    const allDuplicates: DuplicateGroup[] = [];
+    const seenDeleteIds = new Set<string>();
+
+    // Find duplicates for each field
+    for (const field of fields) {
+      console.log(`🔍 Checking duplicates for field: ${field}...`);
+      const fieldDuplicates = await findDuplicatesForField(client, field, parsedArgs.batchSize);
+
+      // Filter out already seen delete IDs to avoid counting the same document multiple times
+      for (const dup of fieldDuplicates) {
+        const newDeleteIds = dup.deleteIds.filter(id => !seenDeleteIds.has(id));
+        if (newDeleteIds.length > 0) {
+          dup.deleteIds = newDeleteIds;
+          newDeleteIds.forEach(id => seenDeleteIds.add(id));
+          allDuplicates.push(dup);
+        }
+      }
+
+      console.log(`   Found ${fieldDuplicates.length} duplicate groups for ${field}`);
+    }
+
+    const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
+
+    console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
+    console.log(`📋 Summary:`);
+    console.log(`   Duplicate groups found: ${allDuplicates.length}`);
+    console.log(`   Documents to delete: ${totalToDelete}`);
+    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
+
+    if (allDuplicates.length === 0) {
+      console.log('✨ No duplicates found! Index is clean.\n');
+      return;
+    }
+
+    // Show sample of duplicates
+    console.log(`📝 Sample duplicates (showing first 10):\n`);
+    const samplesToShow = allDuplicates.slice(0, 10);
+    for (const dup of samplesToShow) {
+      const truncatedValue = dup.value.length > 50
+        ? dup.value.substring(0, 50) + '...'
+        : dup.value;
+      console.log(`   Field: ${dup.field}`);
+      console.log(`   Value: ${truncatedValue}`);
+      console.log(`   Keep: ${dup.keepId}`);
+      console.log(`   Delete: ${dup.deleteIds.length} document(s)`);
+      console.log('');
+    }
+
+    if (allDuplicates.length > 10) {
+      console.log(`   ... and ${allDuplicates.length - 10} more duplicate groups\n`);
+    }
+
+    if (parsedArgs.dryRun) {
+      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
+      console.log(`🔎 DRY RUN - No changes made`);
+      console.log(`   Run with --execute to remove ${totalToDelete} duplicate documents`);
+      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
+      return;
+    }
+
+    // Execute deletion
+    console.log(`\n🗑️ Removing ${totalToDelete} duplicate documents...\n`);
+
+    let deleted = 0;
+    let errors = 0;
+    const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
+
+    // Delete in batches
+    for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
+      const batch = deleteIds.slice(i, i + parsedArgs.batchSize);
+
+      try {
+        const bulkOperations = batch.flatMap(id => [
+          { delete: { _index: INDEX_NAME, _id: id } }
+        ]);
+
+        const bulkResponse = await client.bulk({
+          operations: bulkOperations,
+          refresh: false
+        });
+
+        if (bulkResponse.errors) {
+          const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
+          errors += errorCount;
+          deleted += batch.length - errorCount;
+        } else {
+          deleted += batch.length;
+        }
+
+        process.stdout.write(`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - Deleted: ${deleted}, Errors: ${errors}`);
+      } catch (error) {
+        console.error(`\n❌ Error deleting batch:`, error);
+        errors += batch.length;
+      }
+    }
+
+    // Refresh index
+    console.log('\n\n🔄 Refreshing index...');
+    await client.indices.refresh({ index: INDEX_NAME });
+
+    // Get new count
+    const newCountResponse = await client.count({ index: INDEX_NAME });
+
+    console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
+    console.log('✅ Duplicate removal complete!');
+    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
+    console.log(`Documents deleted: ${deleted}`);
+    console.log(`Errors: ${errors}`);
+    console.log(`Previous document count: ${countResponse.count}`);
+    console.log(`New document count: ${newCountResponse.count}`);
+    console.log('');
+
+  } catch (error) {
+    console.error('\n❌ Error:', error instanceof Error ? error.message : error);
+    process.exit(1);
+  }
+}
+
+// Parse command line arguments
+const args = process.argv.slice(2);
+const parsedArgs = parseArgs(args);
+
+if (parsedArgs.showHelp) {
+  showHelp();
+}
+
+// Validate field if provided
+const validFields = ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
+if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
+  console.error(`❌ Invalid field: ${parsedArgs.field}`);
+  console.error(`   Valid fields: ${validFields.join(', ')}`);
+  process.exit(1);
+}
+
+console.log(`\n🔧 Configuration:`);
+console.log(`   Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
+console.log(`   Batch size: ${parsedArgs.batchSize}`);
+if (parsedArgs.field) {
+  console.log(`   Field: ${parsedArgs.field}`);
+} else {
+  console.log(`   Fields: all (plaintext, md5, sha1, sha256, sha512)`);
+}
+console.log('');
+
+removeDuplicates(parsedArgs).catch(console.error);