remove-duplicates

Signed-off-by: ale <ale@manalejandro.com>
Este commit está contenido en:
ale
2025-12-08 20:56:04 +01:00
padre 9c0c30e846
commit 459cdcd9bc
Se han modificado 2 ficheros con 352 adiciones y 1 borrados

350
scripts/remove-duplicates.ts Archivo normal
Ver fichero

@@ -0,0 +1,350 @@
#!/usr/bin/env node
/**
* Hasher Duplicate Remover Script
*
* This script finds and removes duplicate entries from the Elasticsearch index.
* It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
*
* Usage:
* npx tsx scripts/remove-duplicates.ts [options]
* npm run remove-duplicates [-- options]
*
* Options:
* --dry-run Show duplicates without removing them (default)
* --execute Actually remove the duplicates
* --batch-size=<number> Number of items to process in each batch (default: 1000)
* --field=<field> Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
* --help, -h Show this help message
*/
import { Client } from '@elastic/elasticsearch';
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const INDEX_NAME = 'hasher';
const DEFAULT_BATCH_SIZE = 1000;
interface ParsedArgs {
dryRun: boolean;
batchSize: number;
field: string | null;
showHelp: boolean;
}
interface DuplicateGroup {
value: string;
field: string;
documentIds: string[];
keepId: string;
deleteIds: string[];
}
function parseArgs(args: string[]): ParsedArgs {
const result: ParsedArgs = {
dryRun: true,
batchSize: DEFAULT_BATCH_SIZE,
field: null,
showHelp: false
};
for (let i = 0; i < args.length; i++) {
const arg = args[i];
if (arg === '--help' || arg === '-h') {
result.showHelp = true;
} else if (arg === '--dry-run') {
result.dryRun = true;
} else if (arg === '--execute') {
result.dryRun = false;
} else if (arg.startsWith('--batch-size=')) {
const value = arg.split('=')[1];
const parsed = parseInt(value, 10);
if (!isNaN(parsed) && parsed > 0) {
result.batchSize = parsed;
}
} else if (arg === '--batch-size') {
const nextArg = args[i + 1];
if (nextArg && !nextArg.startsWith('-')) {
const parsed = parseInt(nextArg, 10);
if (!isNaN(parsed) && parsed > 0) {
result.batchSize = parsed;
i++;
}
}
} else if (arg.startsWith('--field=')) {
result.field = arg.split('=')[1];
} else if (arg === '--field') {
const nextArg = args[i + 1];
if (nextArg && !nextArg.startsWith('-')) {
result.field = nextArg;
i++;
}
}
}
return result;
}
function showHelp() {
console.log(`
Hasher Duplicate Remover Script
Usage:
npx tsx scripts/remove-duplicates.ts [options]
npm run remove-duplicates [-- options]
Options:
--dry-run Show duplicates without removing them (default)
--execute Actually remove the duplicates
--batch-size=<number> Number of items to process in each batch (default: 1000)
--field=<field> Check duplicates only on this field
Valid fields: plaintext, md5, sha1, sha256, sha512
--help, -h Show this help message
Environment Variables:
ELASTICSEARCH_NODE Elasticsearch node URL (default: http://localhost:9200)
Examples:
npx tsx scripts/remove-duplicates.ts # Dry run, show all duplicates
npx tsx scripts/remove-duplicates.ts --execute # Remove all duplicates
npx tsx scripts/remove-duplicates.ts --field=md5 # Check only md5 duplicates
npx tsx scripts/remove-duplicates.ts --execute --field=plaintext
Notes:
- The script keeps the OLDEST document (by created_at) and removes newer duplicates
- Always run with --dry-run first to review what will be deleted
- Duplicates are checked across all hash fields by default
`);
process.exit(0);
}
async function findDuplicatesForField(
client: Client,
field: string,
batchSize: number
): Promise<DuplicateGroup[]> {
const duplicates: DuplicateGroup[] = [];
// Use aggregation to find duplicate values
const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
const response = await client.search({
index: INDEX_NAME,
size: 0,
aggs: {
duplicates: {
terms: {
field: fieldToAggregate,
min_doc_count: 2,
size: batchSize
}
}
}
});
const buckets = (response.aggregations?.duplicates as any)?.buckets || [];
for (const bucket of buckets) {
const value = bucket.key;
// Get all documents with this value, sorted by created_at
const docsResponse = await client.search({
index: INDEX_NAME,
size: bucket.doc_count,
query: {
term: {
[fieldToAggregate]: value
}
},
sort: [
{ created_at: { order: 'asc' } }
],
_source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512', 'created_at']
});
const documentIds = docsResponse.hits.hits.map((hit: any) => hit._id);
if (documentIds.length > 1) {
duplicates.push({
value: String(value),
field,
documentIds,
keepId: documentIds[0], // Keep the oldest
deleteIds: documentIds.slice(1) // Delete the rest
});
}
}
return duplicates;
}
async function removeDuplicates(parsedArgs: ParsedArgs) {
const client = new Client({ node: ELASTICSEARCH_NODE });
const fields = parsedArgs.field
? [parsedArgs.field]
: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
console.log(`🔍 Hasher Duplicate Remover`);
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
console.log(`Index: ${INDEX_NAME}`);
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
console.log(`Batch size: ${parsedArgs.batchSize}`);
console.log(`Fields to check: ${fields.join(', ')}`);
console.log('');
try {
// Test connection
console.log('🔗 Connecting to Elasticsearch...');
await client.cluster.health({});
console.log('✅ Connected successfully\n');
// Get index stats
const countResponse = await client.count({ index: INDEX_NAME });
console.log(`📊 Total documents in index: ${countResponse.count}\n`);
const allDuplicates: DuplicateGroup[] = [];
const seenDeleteIds = new Set<string>();
// Find duplicates for each field
for (const field of fields) {
console.log(`🔍 Checking duplicates for field: ${field}...`);
const fieldDuplicates = await findDuplicatesForField(client, field, parsedArgs.batchSize);
// Filter out already seen delete IDs to avoid counting the same document multiple times
for (const dup of fieldDuplicates) {
const newDeleteIds = dup.deleteIds.filter(id => !seenDeleteIds.has(id));
if (newDeleteIds.length > 0) {
dup.deleteIds = newDeleteIds;
newDeleteIds.forEach(id => seenDeleteIds.add(id));
allDuplicates.push(dup);
}
}
console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
}
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`📋 Summary:`);
console.log(` Duplicate groups found: ${allDuplicates.length}`);
console.log(` Documents to delete: ${totalToDelete}`);
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
if (allDuplicates.length === 0) {
console.log('✨ No duplicates found! Index is clean.\n');
return;
}
// Show sample of duplicates
console.log(`📝 Sample duplicates (showing first 10):\n`);
const samplesToShow = allDuplicates.slice(0, 10);
for (const dup of samplesToShow) {
const truncatedValue = dup.value.length > 50
? dup.value.substring(0, 50) + '...'
: dup.value;
console.log(` Field: ${dup.field}`);
console.log(` Value: ${truncatedValue}`);
console.log(` Keep: ${dup.keepId}`);
console.log(` Delete: ${dup.deleteIds.length} document(s)`);
console.log('');
}
if (allDuplicates.length > 10) {
console.log(` ... and ${allDuplicates.length - 10} more duplicate groups\n`);
}
if (parsedArgs.dryRun) {
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`🔎 DRY RUN - No changes made`);
console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
return;
}
// Execute deletion
console.log(`\n🗑 Removing ${totalToDelete} duplicate documents...\n`);
let deleted = 0;
let errors = 0;
const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
// Delete in batches
for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
const batch = deleteIds.slice(i, i + parsedArgs.batchSize);
try {
const bulkOperations = batch.flatMap(id => [
{ delete: { _index: INDEX_NAME, _id: id } }
]);
const bulkResponse = await client.bulk({
operations: bulkOperations,
refresh: false
});
if (bulkResponse.errors) {
const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
errors += errorCount;
deleted += batch.length - errorCount;
} else {
deleted += batch.length;
}
process.stdout.write(`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - Deleted: ${deleted}, Errors: ${errors}`);
} catch (error) {
console.error(`\n❌ Error deleting batch:`, error);
errors += batch.length;
}
}
// Refresh index
console.log('\n\n🔄 Refreshing index...');
await client.indices.refresh({ index: INDEX_NAME });
// Get new count
const newCountResponse = await client.count({ index: INDEX_NAME });
console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
console.log('✅ Duplicate removal complete!');
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`Documents deleted: ${deleted}`);
console.log(`Errors: ${errors}`);
console.log(`Previous document count: ${countResponse.count}`);
console.log(`New document count: ${newCountResponse.count}`);
console.log('');
} catch (error) {
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
process.exit(1);
}
}
// Parse command line arguments
const args = process.argv.slice(2);
const parsedArgs = parseArgs(args);
if (parsedArgs.showHelp) {
showHelp();
}
// Validate field if provided
const validFields = ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
console.error(`❌ Invalid field: ${parsedArgs.field}`);
console.error(` Valid fields: ${validFields.join(', ')}`);
process.exit(1);
}
console.log(`\n🔧 Configuration:`);
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
console.log(` Batch size: ${parsedArgs.batchSize}`);
if (parsedArgs.field) {
console.log(` Field: ${parsedArgs.field}`);
} else {
console.log(` Fields: all (plaintext, md5, sha1, sha256, sha512)`);
}
console.log('');
removeDuplicates(parsedArgs).catch(console.error);