Signed-off-by: ale <ale@manalejandro.com>
Este commit está contenido en:
ale
2025-12-08 20:58:02 +01:00
padre 459cdcd9bc
commit ad7a1cf0a7

Ver fichero

@@ -128,29 +128,52 @@ async function findDuplicatesForField(
// Use aggregation to find duplicate values
const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
const response = await client.search({
// Use composite aggregation to handle a large number of duplicates
let afterKey: any = undefined;
let hasMore = true;
console.log(` Scanning for duplicates...`);
while (hasMore) {
const aggQuery: any = {
index: INDEX_NAME,
size: 0,
aggs: {
duplicates: {
terms: {
field: fieldToAggregate,
min_doc_count: 2,
size: batchSize
composite: {
size: batchSize,
sources: [
{ value: { terms: { field: fieldToAggregate } } }
],
...(afterKey && { after: afterKey })
},
aggs: {
doc_count_filter: {
bucket_selector: {
buckets_path: { count: '_count' },
script: 'params.count > 1'
}
}
}
});
}
}
};
const buckets = (response.aggregations?.duplicates as any)?.buckets || [];
const response = await client.search(aggQuery);
const compositeAgg = response.aggregations?.duplicates as any;
const buckets = compositeAgg?.buckets || [];
for (const bucket of buckets) {
const value = bucket.key;
if (bucket.doc_count > 1) {
const value = bucket.key.value;
// Get all documents with this value, sorted by created_at
const docsResponse = await client.search({
// Use scroll API for large result sets
const documentIds: string[] = [];
let scrollResponse = await client.search({
index: INDEX_NAME,
size: bucket.doc_count,
scroll: '1m',
size: 1000,
query: {
term: {
[fieldToAggregate]: value
@@ -159,10 +182,24 @@ async function findDuplicatesForField(
sort: [
{ created_at: { order: 'asc' } }
],
_source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512', 'created_at']
_source: false
});
const documentIds = docsResponse.hits.hits.map((hit: any) => hit._id);
while (scrollResponse.hits.hits.length > 0) {
documentIds.push(...scrollResponse.hits.hits.map((hit: any) => hit._id));
if (!scrollResponse._scroll_id) break;
scrollResponse = await client.scroll({
scroll_id: scrollResponse._scroll_id,
scroll: '1m'
});
}
// Clear scroll
if (scrollResponse._scroll_id) {
await client.clearScroll({ scroll_id: scrollResponse._scroll_id }).catch(() => {});
}
if (documentIds.length > 1) {
duplicates.push({
@@ -174,6 +211,16 @@ async function findDuplicatesForField(
});
}
}
}
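// Check if there are more results.
// NOTE(review): the bucket_selector sub-aggregation removes non-duplicate buckets from
// each composite page, so buckets.length can be below batchSize even when more pages
// exist — confirm whether pagination should instead continue while after_key is present.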
afterKey = compositeAgg?.after_key;
hasMore = buckets.length === batchSize && afterKey;
if (hasMore) {
process.stdout.write(`\r Found ${duplicates.length} duplicate groups so far...`);
}
}
return duplicates;
}