Compare commits

1 commit

Author: ale
SHA1: b91d19dc0b
Message: fix memory remove dup
Signed-off-by: ale <ale@manalejandro.com>
Date: 2025-12-21 22:36:31 +01:00

View file

@@ -225,50 +225,166 @@ async function findDuplicatesForField(
return duplicates;
}
/**
* Phase 1: Initialize and connect to Elasticsearch
*/
async function phase1_InitAndConnect() {
console.log(`🔍 Hasher Duplicate Remover - Phase 1: Initialization`);
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
console.log(`Index: ${INDEX_NAME}`);
console.log('');
const client = new Client({ node: ELASTICSEARCH_NODE });
console.log('🔗 Connecting to Elasticsearch...');
await client.cluster.health({});
console.log('✅ Connected successfully\n');
const countResponse = await client.count({ index: INDEX_NAME });
console.log(`📊 Total documents in index: ${countResponse.count}\n`);
return { client, totalDocuments: countResponse.count };
}
/**
* Phase 2: Find duplicates for a specific field
*/
async function phase2_FindDuplicatesForField(
client: Client,
field: string,
batchSize: number,
seenDeleteIds: Set<string>
): Promise<{ duplicates: DuplicateGroup[], totalFound: number }> {
console.log(`\n🔍 Phase 2: Checking duplicates for field: ${field}...`);
const fieldDuplicates = await findDuplicatesForField(client, field, batchSize);
const duplicates: DuplicateGroup[] = [];
// Filter out already seen delete IDs to avoid counting the same document multiple times
for (const dup of fieldDuplicates) {
const newDeleteIds = dup.deleteIds.filter(id => !seenDeleteIds.has(id));
if (newDeleteIds.length > 0) {
dup.deleteIds = newDeleteIds;
newDeleteIds.forEach(id => seenDeleteIds.add(id));
duplicates.push(dup);
}
}
console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
console.log(` New unique documents to delete: ${duplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0)}`);
// Force garbage collection if available
if (global.gc) {
global.gc();
console.log(` ♻️ Memory freed after processing ${field}`);
}
return { duplicates, totalFound: fieldDuplicates.length };
}
/**
* Phase 3: Process deletion for a batch of duplicates
*/
async function phase3_DeleteBatch(
client: Client,
deleteIds: string[],
batchSize: number,
startIndex: number
): Promise<{ deleted: number, errors: number }> {
const batch = deleteIds.slice(startIndex, startIndex + batchSize);
let deleted = 0;
let errors = 0;
try {
const bulkOperations = batch.flatMap(id => [
{ delete: { _index: INDEX_NAME, _id: id } }
]);
const bulkResponse = await client.bulk({
operations: bulkOperations,
refresh: false
});
if (bulkResponse.errors) {
const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
errors += errorCount;
deleted += batch.length - errorCount;
} else {
deleted += batch.length;
}
} catch (error) {
console.error(`\n❌ Error deleting batch:`, error);
errors += batch.length;
}
// Force garbage collection if available
if (global.gc) {
global.gc();
}
return { deleted, errors };
}
/**
* Phase 4: Finalize and report results
*/
async function phase4_Finalize(
client: Client,
totalDeleted: number,
totalErrors: number,
initialDocumentCount: number
) {
console.log('\n\n🔄 Phase 4: Refreshing index...');
await client.indices.refresh({ index: INDEX_NAME });
const newCountResponse = await client.count({ index: INDEX_NAME });
console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
console.log('✅ Duplicate removal complete!');
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`Documents deleted: ${totalDeleted}`);
console.log(`Errors: ${totalErrors}`);
console.log(`Previous document count: ${initialDocumentCount}`);
console.log(`New document count: ${newCountResponse.count}`);
console.log('');
}
async function removeDuplicates(parsedArgs: ParsedArgs) {
const fields = parsedArgs.field
? [parsedArgs.field]
: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
console.log(`Batch size: ${parsedArgs.batchSize}`);
console.log(`Fields to check: ${fields.join(', ')}`);
console.log('');
try {
// === PHASE 1: Initialize ===
const { client, totalDocuments } = await phase1_InitAndConnect();
// Force garbage collection after phase 1
if (global.gc) {
global.gc();
console.log('♻️ Memory freed after initialization\n');
}
// === PHASE 2: Find duplicates field by field ===
const allDuplicates: DuplicateGroup[] = [];
const seenDeleteIds = new Set<string>();
for (const field of fields) {
const { duplicates } = await phase2_FindDuplicatesForField(
client,
field,
parsedArgs.batchSize,
seenDeleteIds
);
allDuplicates.push(...duplicates);
// Clear field duplicates to free memory
duplicates.length = 0;
}
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
@@ -310,57 +426,40 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
return;
}
// === PHASE 3: Execute deletion in batches ===
console.log(`\n🗑 Phase 3: Removing ${totalToDelete} duplicate documents...\n`);
let totalDeleted = 0;
let totalErrors = 0;
const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
// Clear allDuplicates to free memory
allDuplicates.length = 0;
// Delete in batches with memory management
for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
const { deleted, errors } = await phase3_DeleteBatch(
client,
deleteIds,
parsedArgs.batchSize,
i
);
totalDeleted += deleted;
totalErrors += errors;
process.stdout.write(
`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - ` +
`Deleted: ${totalDeleted}, Errors: ${totalErrors}`
);
}
// Clear deleteIds to free memory
deleteIds.length = 0;
seenDeleteIds.clear();
// === PHASE 4: Finalize ===
await phase4_Finalize(client, totalDeleted, totalErrors, totalDocuments);
} catch (error) {
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
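
Note: the `global.gc()` calls this commit leans on are no-ops unless Node.js is launched with the `--expose-gc` flag; without it, `global.gc` is undefined and every memory-freeing branch is silently skipped. A minimal startup guard is sketched below; the entry-point path in the comment is illustrative, not part of the commit.

// The V8 flag --expose-gc must be passed at launch for global.gc to exist,
// e.g. (file name assumed for illustration): node --expose-gc dist/remove-duplicates.js
if (typeof global.gc !== 'function') {
console.warn('⚠️ global.gc unavailable: start Node with --expose-gc to enable the manual memory cleanup.');
}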