@@ -3,7 +3,7 @@
|
||||
/**
|
||||
* Hasher Duplicate Remover Script
|
||||
*
|
||||
* This script finds and removes duplicate entries from the Elasticsearch index.
|
||||
* This script finds and removes duplicate entries from Redis.
|
||||
* It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
|
||||
*
|
||||
* Usage:
|
||||
@@ -13,20 +13,20 @@
|
||||
* Options:
|
||||
* --dry-run Show duplicates without removing them (default)
|
||||
* --execute Actually remove the duplicates
|
||||
* --batch-size=<number> Number of items to process in each batch (default: 1000)
|
||||
* --field=<field> Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
|
||||
* --help, -h Show this help message
|
||||
*/
|
||||
|
||||
import { Client } from '@elastic/elasticsearch';
|
||||
import Redis from 'ioredis';
|
||||
|
||||
// Connection settings. Each value can be overridden through the environment
// variable of the same name. `||` is used for the fallback so that an
// empty-string variable is treated the same as an unset one.

/** Elasticsearch node URL. */
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';

/** Redis host name. */
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';

/** Redis port, parsed as a base-10 integer (NaN when the variable is not numeric). */
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);

/** Redis password; `undefined` when the variable is unset or empty. */
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;

/** Redis database number, parsed as a base-10 integer (NaN when the variable is not numeric). */
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);

/** Name of the index that is queried and printed in the run banner. */
const INDEX_NAME = 'hasher';

/** Batch size used when no --batch-size option is supplied. */
const DEFAULT_BATCH_SIZE = 1000;
|
||||
|
||||
/**
 * Command-line options after parsing.
 */
interface ParsedArgs {
  /** True to only report duplicates; false (set by --execute) to delete them. */
  dryRun: boolean;
  /** Number of items to process per batch; starts at DEFAULT_BATCH_SIZE. */
  batchSize: number;
  /** Single field to restrict the duplicate check to, or null to check the default set. */
  field: string | null;
  /** True when the help text was requested. */
  showHelp: boolean;
}
|
||||
@@ -34,15 +34,23 @@ interface ParsedArgs {
|
||||
/**
 * One set of entries that share the same value for a given field.
 *
 * NOTE(review): this declaration carries both document-ID members
 * (documentIds / keepId / deleteIds) and plaintext members
 * (plaintexts / keepPlaintext / deletePlaintexts). They look like two
 * revisions of the same interface; confirm which set is meant to remain
 * before relying on either.
 */
interface DuplicateGroup {
  /** The shared value that identifies the group. */
  value: string;
  /** Name of the field the value was found in. */
  field: string;
  /** IDs of every document in the group, oldest first. */
  documentIds: string[];
  /** ID of the document that is kept (the first, i.e. oldest, entry). */
  keepId: string;
  /** IDs of the documents that are scheduled for deletion. */
  deleteIds: string[];
  /** Plaintexts of every entry in the group, oldest first. */
  plaintexts: string[];
  /** Plaintext of the entry that is kept (the first, i.e. oldest, entry). */
  keepPlaintext: string;
  /** Plaintexts of the entries that are scheduled for deletion. */
  deletePlaintexts: string[];
}
|
||||
|
||||
/**
 * Shape of the JSON document stored under a `hash:plaintext:<plaintext>` key.
 */
interface HashDocument {
  /** The original input string. */
  plaintext: string;
  /** MD5 value; also used as the suffix of the `hash:index:md5:` key. */
  md5: string;
  /** SHA-1 value; also used as the suffix of the `hash:index:sha1:` key. */
  sha1: string;
  /** SHA-256 value; also used as the suffix of the `hash:index:sha256:` key. */
  sha256: string;
  /** SHA-512 value; also used as the suffix of the `hash:index:sha512:` key. */
  sha512: string;
  /** Creation timestamp as a string that `new Date(...)` can parse; used to order duplicates oldest first. */
  created_at: string;
}
|
||||
|
||||
function parseArgs(args: string[]): ParsedArgs {
|
||||
const result: ParsedArgs = {
|
||||
dryRun: true,
|
||||
batchSize: DEFAULT_BATCH_SIZE,
|
||||
field: null,
|
||||
showHelp: false
|
||||
};
|
||||
@@ -56,21 +64,6 @@ function parseArgs(args: string[]): ParsedArgs {
|
||||
result.dryRun = true;
|
||||
} else if (arg === '--execute') {
|
||||
result.dryRun = false;
|
||||
} else if (arg.startsWith('--batch-size=')) {
|
||||
const value = arg.split('=')[1];
|
||||
const parsed = parseInt(value, 10);
|
||||
if (!isNaN(parsed) && parsed > 0) {
|
||||
result.batchSize = parsed;
|
||||
}
|
||||
} else if (arg === '--batch-size') {
|
||||
const nextArg = args[i + 1];
|
||||
if (nextArg && !nextArg.startsWith('-')) {
|
||||
const parsed = parseInt(nextArg, 10);
|
||||
if (!isNaN(parsed) && parsed > 0) {
|
||||
result.batchSize = parsed;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
} else if (arg.startsWith('--field=')) {
|
||||
result.field = arg.split('=')[1];
|
||||
} else if (arg === '--field') {
|
||||
@@ -96,13 +89,15 @@ Usage:
|
||||
Options:
|
||||
--dry-run Show duplicates without removing them (default)
|
||||
--execute Actually remove the duplicates
|
||||
--batch-size=<number> Number of items to process in each batch (default: 1000)
|
||||
--field=<field> Check duplicates only on this field
|
||||
Valid fields: plaintext, md5, sha1, sha256, sha512
|
||||
--help, -h Show this help message
|
||||
|
||||
Environment Variables:
|
||||
ELASTICSEARCH_NODE Elasticsearch node URL (default: http://localhost:9200)
|
||||
REDIS_HOST Redis host (default: localhost)
|
||||
REDIS_PORT Redis port (default: 6379)
|
||||
REDIS_PASSWORD Redis password (optional)
|
||||
REDIS_DB Redis database number (default: 0)
|
||||
|
||||
Examples:
|
||||
npx tsx scripts/remove-duplicates.ts # Dry run, show all duplicates
|
||||
@@ -119,106 +114,78 @@ Notes:
|
||||
}
|
||||
|
||||
async function findDuplicatesForField(
|
||||
client: Client,
|
||||
field: string,
|
||||
batchSize: number
|
||||
client: Redis,
|
||||
field: string
|
||||
): Promise<DuplicateGroup[]> {
|
||||
const duplicates: DuplicateGroup[] = [];
|
||||
|
||||
// Use aggregation to find duplicate values
|
||||
const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
|
||||
console.log(` Scanning for ${field} duplicates...`);
|
||||
|
||||
// Use composite aggregation to handle large number of duplicates
|
||||
let afterKey: any = undefined;
|
||||
let hasMore = true;
|
||||
// Get all keys for this field type
|
||||
const pattern = field === 'plaintext'
|
||||
? 'hash:plaintext:*'
|
||||
: `hash:index:${field}:*`;
|
||||
|
||||
console.log(` Scanning for duplicates...`);
|
||||
const keys = await client.keys(pattern);
|
||||
|
||||
while (hasMore) {
|
||||
const aggQuery: any = {
|
||||
index: INDEX_NAME,
|
||||
size: 0,
|
||||
aggs: {
|
||||
duplicates: {
|
||||
composite: {
|
||||
size: batchSize,
|
||||
sources: [
|
||||
{ value: { terms: { field: fieldToAggregate } } }
|
||||
],
|
||||
...(afterKey && { after: afterKey })
|
||||
},
|
||||
aggs: {
|
||||
doc_count_filter: {
|
||||
bucket_selector: {
|
||||
buckets_path: { count: '_count' },
|
||||
script: 'params.count > 1'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// For hash indexes, group by hash value (not plaintext)
|
||||
const valueMap = new Map<string, string[]>();
|
||||
|
||||
if (field === 'plaintext') {
|
||||
// Each key is already unique for plaintext
|
||||
// Check for same plaintext with different created_at
|
||||
for (const key of keys) {
|
||||
const plaintext = key.replace('hash:plaintext:', '');
|
||||
if (!valueMap.has(plaintext)) {
|
||||
valueMap.set(plaintext, []);
|
||||
}
|
||||
};
|
||||
|
||||
const response = await client.search(aggQuery);
|
||||
const compositeAgg = response.aggregations?.duplicates as any;
|
||||
const buckets = compositeAgg?.buckets || [];
|
||||
|
||||
for (const bucket of buckets) {
|
||||
if (bucket.doc_count > 1) {
|
||||
const value = bucket.key.value;
|
||||
|
||||
// Use scroll API for large result sets
|
||||
const documentIds: string[] = [];
|
||||
|
||||
let scrollResponse = await client.search({
|
||||
index: INDEX_NAME,
|
||||
scroll: '1m',
|
||||
size: 1000,
|
||||
query: {
|
||||
term: {
|
||||
[fieldToAggregate]: value
|
||||
}
|
||||
},
|
||||
sort: [
|
||||
{ created_at: { order: 'asc' } }
|
||||
],
|
||||
_source: false
|
||||
});
|
||||
|
||||
while (scrollResponse.hits.hits.length > 0) {
|
||||
documentIds.push(...scrollResponse.hits.hits.map((hit: any) => hit._id));
|
||||
|
||||
if (!scrollResponse._scroll_id) break;
|
||||
|
||||
scrollResponse = await client.scroll({
|
||||
scroll_id: scrollResponse._scroll_id,
|
||||
scroll: '1m'
|
||||
});
|
||||
}
|
||||
|
||||
// Clear scroll
|
||||
if (scrollResponse._scroll_id) {
|
||||
await client.clearScroll({ scroll_id: scrollResponse._scroll_id }).catch(() => {});
|
||||
}
|
||||
|
||||
if (documentIds.length > 1) {
|
||||
duplicates.push({
|
||||
value: String(value),
|
||||
field,
|
||||
documentIds,
|
||||
keepId: documentIds[0], // Keep the oldest
|
||||
deleteIds: documentIds.slice(1) // Delete the rest
|
||||
});
|
||||
valueMap.get(plaintext)!.push(plaintext);
|
||||
}
|
||||
} else {
|
||||
// For hash fields, get the plaintext and check if multiple plaintexts have same hash
|
||||
for (const key of keys) {
|
||||
const hashValue = key.replace(`hash:index:${field}:`, '');
|
||||
const plaintext = await client.get(key);
|
||||
|
||||
if (plaintext) {
|
||||
if (!valueMap.has(hashValue)) {
|
||||
valueMap.set(hashValue, []);
|
||||
}
|
||||
valueMap.get(hashValue)!.push(plaintext);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if there are more results
|
||||
afterKey = compositeAgg?.after_key;
|
||||
hasMore = buckets.length === batchSize && afterKey;
|
||||
}
|
||||
|
||||
// Find groups with duplicates
|
||||
for (const [value, plaintexts] of valueMap) {
|
||||
const uniquePlaintexts = Array.from(new Set(plaintexts));
|
||||
|
||||
if (hasMore) {
|
||||
process.stdout.write(`\r Found ${duplicates.length} duplicate groups so far...`);
|
||||
if (uniquePlaintexts.length > 1) {
|
||||
// Get documents to compare timestamps
|
||||
const docs: { plaintext: string; doc: HashDocument }[] = [];
|
||||
|
||||
for (const plaintext of uniquePlaintexts) {
|
||||
const docKey = `hash:plaintext:${plaintext}`;
|
||||
const docData = await client.get(docKey);
|
||||
if (docData) {
|
||||
docs.push({ plaintext, doc: JSON.parse(docData) });
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by created_at (oldest first)
|
||||
docs.sort((a, b) =>
|
||||
new Date(a.doc.created_at).getTime() - new Date(b.doc.created_at).getTime()
|
||||
);
|
||||
|
||||
if (docs.length > 1) {
|
||||
duplicates.push({
|
||||
value,
|
||||
field,
|
||||
plaintexts: docs.map(d => d.plaintext),
|
||||
keepPlaintext: docs[0].plaintext,
|
||||
deletePlaintexts: docs.slice(1).map(d => d.plaintext)
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -226,44 +193,50 @@ async function findDuplicatesForField(
|
||||
}
|
||||
|
||||
async function removeDuplicates(parsedArgs: ParsedArgs) {
|
||||
const client = new Client({ node: ELASTICSEARCH_NODE });
|
||||
const client = new Redis({
|
||||
host: REDIS_HOST,
|
||||
port: REDIS_PORT,
|
||||
password: REDIS_PASSWORD,
|
||||
db: REDIS_DB,
|
||||
});
|
||||
|
||||
const fields = parsedArgs.field
|
||||
? [parsedArgs.field]
|
||||
: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
||||
: ['md5', 'sha1', 'sha256', 'sha512'];
|
||||
|
||||
console.log(`🔍 Hasher Duplicate Remover`);
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
|
||||
console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT} (DB ${REDIS_DB})`);
|
||||
console.log(`Index: ${INDEX_NAME}`);
|
||||
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
|
||||
console.log(`Batch size: ${parsedArgs.batchSize}`);
|
||||
console.log(`Fields to check: ${fields.join(', ')}`);
|
||||
console.log('');
|
||||
|
||||
try {
|
||||
// Test connection
|
||||
console.log('🔗 Connecting to Elasticsearch...');
|
||||
await client.cluster.health({});
|
||||
console.log('🔗 Connecting to Redis...');
|
||||
await client.ping();
|
||||
console.log('✅ Connected successfully\n');
|
||||
|
||||
// Get index stats
|
||||
const countResponse = await client.count({ index: INDEX_NAME });
|
||||
console.log(`📊 Total documents in index: ${countResponse.count}\n`);
|
||||
const stats = await client.hgetall('hash:stats');
|
||||
const totalCount = parseInt(stats.count || '0', 10);
|
||||
console.log(`📊 Total documents in index: ${totalCount}\n`);
|
||||
|
||||
const allDuplicates: DuplicateGroup[] = [];
|
||||
const seenDeleteIds = new Set<string>();
|
||||
const seenPlaintexts = new Set<string>();
|
||||
|
||||
// Find duplicates for each field
|
||||
for (const field of fields) {
|
||||
console.log(`🔍 Checking duplicates for field: ${field}...`);
|
||||
const fieldDuplicates = await findDuplicatesForField(client, field, parsedArgs.batchSize);
|
||||
const fieldDuplicates = await findDuplicatesForField(client, field);
|
||||
|
||||
// Filter out already seen delete IDs to avoid counting the same document multiple times
|
||||
// Filter out already seen plaintexts
|
||||
for (const dup of fieldDuplicates) {
|
||||
const newDeleteIds = dup.deleteIds.filter(id => !seenDeleteIds.has(id));
|
||||
if (newDeleteIds.length > 0) {
|
||||
dup.deleteIds = newDeleteIds;
|
||||
newDeleteIds.forEach(id => seenDeleteIds.add(id));
|
||||
const newDeletePlaintexts = dup.deletePlaintexts.filter(p => !seenPlaintexts.has(p));
|
||||
if (newDeletePlaintexts.length > 0) {
|
||||
dup.deletePlaintexts = newDeletePlaintexts;
|
||||
newDeletePlaintexts.forEach(p => seenPlaintexts.add(p));
|
||||
allDuplicates.push(dup);
|
||||
}
|
||||
}
|
||||
@@ -271,7 +244,7 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
|
||||
console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
|
||||
}
|
||||
|
||||
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
|
||||
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deletePlaintexts.length, 0);
|
||||
|
||||
console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`📋 Summary:`);
|
||||
@@ -281,6 +254,7 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
|
||||
|
||||
if (allDuplicates.length === 0) {
|
||||
console.log('✨ No duplicates found! Index is clean.\n');
|
||||
await client.quit();
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -293,8 +267,8 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
|
||||
: dup.value;
|
||||
console.log(` Field: ${dup.field}`);
|
||||
console.log(` Value: ${truncatedValue}`);
|
||||
console.log(` Keep: ${dup.keepId}`);
|
||||
console.log(` Delete: ${dup.deleteIds.length} document(s)`);
|
||||
console.log(` Keep: ${dup.keepPlaintext}`);
|
||||
console.log(` Delete: ${dup.deletePlaintexts.length} document(s)`);
|
||||
console.log('');
|
||||
}
|
||||
|
||||
@@ -307,6 +281,7 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
|
||||
console.log(`🔎 DRY RUN - No changes made`);
|
||||
console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
||||
await client.quit();
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -315,53 +290,61 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
|
||||
|
||||
let deleted = 0;
|
||||
let errors = 0;
|
||||
const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
|
||||
|
||||
// Delete in batches
|
||||
for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
|
||||
const batch = deleteIds.slice(i, i + parsedArgs.batchSize);
|
||||
|
||||
try {
|
||||
const bulkOperations = batch.flatMap(id => [
|
||||
{ delete: { _index: INDEX_NAME, _id: id } }
|
||||
]);
|
||||
|
||||
const bulkResponse = await client.bulk({
|
||||
operations: bulkOperations,
|
||||
refresh: false
|
||||
});
|
||||
|
||||
if (bulkResponse.errors) {
|
||||
const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
|
||||
errors += errorCount;
|
||||
deleted += batch.length - errorCount;
|
||||
} else {
|
||||
deleted += batch.length;
|
||||
for (const dup of allDuplicates) {
|
||||
for (const plaintext of dup.deletePlaintexts) {
|
||||
try {
|
||||
const docKey = `hash:plaintext:${plaintext}`;
|
||||
const docData = await client.get(docKey);
|
||||
|
||||
if (docData) {
|
||||
const doc: HashDocument = JSON.parse(docData);
|
||||
const pipeline = client.pipeline();
|
||||
|
||||
// Delete main document
|
||||
pipeline.del(docKey);
|
||||
|
||||
// Delete all indexes
|
||||
pipeline.del(`hash:index:md5:${doc.md5}`);
|
||||
pipeline.del(`hash:index:sha1:${doc.sha1}`);
|
||||
pipeline.del(`hash:index:sha256:${doc.sha256}`);
|
||||
pipeline.del(`hash:index:sha512:${doc.sha512}`);
|
||||
|
||||
// Update statistics
|
||||
pipeline.hincrby('hash:stats', 'count', -1);
|
||||
pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);
|
||||
|
||||
const results = await pipeline.exec();
|
||||
|
||||
if (results && results.some(([err]) => err !== null)) {
|
||||
errors++;
|
||||
} else {
|
||||
deleted++;
|
||||
}
|
||||
}
|
||||
|
||||
process.stdout.write(`\r⏳ Progress: ${deleted + errors}/${totalToDelete} - Deleted: ${deleted}, Errors: ${errors}`);
|
||||
} catch (error) {
|
||||
console.error(`\n❌ Error deleting ${plaintext}:`, error);
|
||||
errors++;
|
||||
}
|
||||
|
||||
process.stdout.write(`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - Deleted: ${deleted}, Errors: ${errors}`);
|
||||
} catch (error) {
|
||||
console.error(`\n❌ Error deleting batch:`, error);
|
||||
errors += batch.length;
|
||||
}
|
||||
}
|
||||
|
||||
// Refresh index
|
||||
console.log('\n\n🔄 Refreshing index...');
|
||||
await client.indices.refresh({ index: INDEX_NAME });
|
||||
|
||||
// Get new count
|
||||
const newCountResponse = await client.count({ index: INDEX_NAME });
|
||||
const newStats = await client.hgetall('hash:stats');
|
||||
const newCount = parseInt(newStats.count || '0', 10);
|
||||
|
||||
console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
|
||||
console.log('\n\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
|
||||
console.log('✅ Duplicate removal complete!');
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`Documents deleted: ${deleted}`);
|
||||
console.log(`Errors: ${errors}`);
|
||||
console.log(`Previous document count: ${countResponse.count}`);
|
||||
console.log(`New document count: ${newCountResponse.count}`);
|
||||
console.log(`Previous document count: ${totalCount}`);
|
||||
console.log(`New document count: ${newCount}`);
|
||||
console.log('');
|
||||
|
||||
await client.quit();
|
||||
} catch (error) {
|
||||
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
@@ -386,11 +369,10 @@ if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
|
||||
|
||||
console.log(`\n🔧 Configuration:`);
|
||||
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
|
||||
console.log(` Batch size: ${parsedArgs.batchSize}`);
|
||||
if (parsedArgs.field) {
|
||||
console.log(` Field: ${parsedArgs.field}`);
|
||||
} else {
|
||||
console.log(` Fields: all (plaintext, md5, sha1, sha256, sha512)`);
|
||||
console.log(` Fields: all (md5, sha1, sha256, sha512)`);
|
||||
}
|
||||
console.log('');
|
||||
|
||||
|
||||
Referencia en una nueva incidencia
Block a user