new redis migration

Signed-off-by: ale <ale@manalejandro.com>
Este commit está contenido en:
ale
2025-12-15 16:35:35 +01:00
padre ad7a1cf0a7
commit 3ce64eeb8e
Se han modificado 20 ficheros con 1021 adiciones y 712 borrados

Ver fichero

@@ -4,7 +4,7 @@
* Hasher Indexer Script
*
* This script reads a text file with one word/phrase per line and indexes
* all the generated hashes into Elasticsearch.
* all the generated hashes into Redis.
*
* Usage:
* npx tsx scripts/index-file.ts <path-to-file.txt> [options]
@@ -19,13 +19,16 @@
* --help, -h Show this help message
*/
import { Client } from '@elastic/elasticsearch';
import Redis from 'ioredis';
import { createReadStream, existsSync, readFileSync, writeFileSync, unlinkSync } from 'fs';
import { resolve, basename } from 'path';
import { createInterface } from 'readline';
import crypto from 'crypto';
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
const INDEX_NAME = 'hasher';
const DEFAULT_BATCH_SIZE = 100;
@@ -35,7 +38,6 @@ interface HashDocument {
sha1: string;
sha256: string;
sha512: string;
bcrypt: string;
created_at: string;
}
@@ -158,16 +160,12 @@ function deleteState(stateFile: string): void {
}
/**
 * Builds the hash record for a single plaintext value.
 *
 * The record holds hex-encoded MD5, SHA-1, SHA-256 and SHA-512 digests
 * computed with Node's `crypto` module, a bcrypt hash, and an ISO-8601
 * timestamp taken at call time.
 *
 * NOTE(review): this text comes from a diff hunk that removes lines; the
 * bcrypt statements look like the removed side of the Redis migration —
 * confirm before relying on the `bcrypt` member.
 *
 * @param plaintext - Word or phrase to hash.
 * @returns The populated hash document for `plaintext`.
 */
async function generateHashes(plaintext: string): Promise<HashDocument> {
// Dynamic import: the bcrypt module is resolved when this function runs.
const bcrypt = await import('bcrypt');
// Second argument (10) is passed to bcrypt as the salt-rounds / cost setting.
const bcryptHash = await bcrypt.default.hash(plaintext, 10);
return {
plaintext,
// Each digest uses a fresh Hash object and is rendered as lowercase hex.
md5: crypto.createHash('md5').update(plaintext).digest('hex'),
sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
bcrypt: bcryptHash,
// Time this record was generated, not the time it is written to storage.
created_at: new Date().toISOString()
};
}
@@ -190,7 +188,10 @@ Options:
--help, -h Show this help message
Environment Variables:
ELASTICSEARCH_NODE Elasticsearch node URL (default: http://localhost:9200)
REDIS_HOST Redis host (default: localhost)
REDIS_PORT Redis port (default: 6379)
REDIS_PASSWORD Redis password (optional)
REDIS_DB Redis database number (default: 0)
Examples:
npx tsx scripts/index-file.ts wordlist.txt
@@ -214,7 +215,14 @@ Duplicate Checking:
}
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
const client = new Client({ node: ELASTICSEARCH_NODE });
const client = new Redis({
host: REDIS_HOST,
port: REDIS_PORT,
password: REDIS_PASSWORD,
db: REDIS_DB,
retryStrategy: (times) => Math.min(times * 50, 2000),
});
const absolutePath = resolve(filePath);
const stateFile = customStateFile || getDefaultStateFile(absolutePath);
const fileHash = getFileHash(absolutePath);
@@ -252,7 +260,7 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
console.log(`📚 Hasher Indexer`);
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT} (DB ${REDIS_DB})`);
console.log(`Index: ${INDEX_NAME}`);
console.log(`File: ${filePath}`);
console.log(`Batch size: ${batchSize}`);
@@ -286,8 +294,8 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
try {
// Test connection
console.log('🔗 Connecting to Elasticsearch...');
await client.cluster.health({});
console.log('🔗 Connecting to Redis...');
await client.ping();
console.log('✅ Connected successfully\n');
// Process file line by line using streams
@@ -310,8 +318,6 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
if (batch.length === 0) return;
if (isInterrupted) return;
const bulkOperations: any[] = [];
// Generate hashes for all items in batch first
const batchWithHashes = await Promise.all(
batch.map(async (plaintext: string) => ({
@@ -320,92 +326,82 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
}))
);
const pipeline = client.pipeline();
let toIndex: typeof batchWithHashes = [];
if (checkDuplicates) {
// Check which items already exist (by plaintext or any hash)
const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256);
const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512);
// Check which items already exist
const existenceChecks = await Promise.all(
batchWithHashes.map(async (item) => {
const plaintextExists = await client.exists(`hash:plaintext:${item.plaintext}`);
if (plaintextExists) return { item, exists: true };
// Check if any hash exists
const md5Exists = await client.exists(`hash:index:md5:${item.hashes.md5}`);
const sha1Exists = await client.exists(`hash:index:sha1:${item.hashes.sha1}`);
const sha256Exists = await client.exists(`hash:index:sha256:${item.hashes.sha256}`);
const sha512Exists = await client.exists(`hash:index:sha512:${item.hashes.sha512}`);
return {
item,
exists: md5Exists || sha1Exists || sha256Exists || sha512Exists
};
})
);
const existingCheck = await client.search({
index: INDEX_NAME,
size: batchSize * 5,
query: {
bool: {
should: [
{ terms: { 'plaintext.keyword': batch } },
{ terms: { md5: md5List } },
{ terms: { sha1: sha1List } },
{ terms: { sha256: sha256List } },
{ terms: { sha512: sha512List } },
],
minimum_should_match: 1
}
},
_source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
});
// Create a set of existing hashes for quick lookup
const existingHashes = new Set<string>();
existingCheck.hits.hits.forEach((hit: any) => {
const src = hit._source;
existingHashes.add(src.plaintext);
existingHashes.add(src.md5);
existingHashes.add(src.sha1);
existingHashes.add(src.sha256);
existingHashes.add(src.sha512);
});
// Prepare bulk operations only for items that don't have any duplicate hash
for (const item of batchWithHashes) {
const isDuplicate =
existingHashes.has(item.plaintext) ||
existingHashes.has(item.hashes.md5) ||
existingHashes.has(item.hashes.sha1) ||
existingHashes.has(item.hashes.sha256) ||
existingHashes.has(item.hashes.sha512);
if (!isDuplicate) {
bulkOperations.push({ index: { _index: INDEX_NAME } });
bulkOperations.push(item.hashes);
} else {
for (const check of existenceChecks) {
if (check.exists) {
state.skipped++;
sessionSkipped++;
} else {
toIndex.push(check.item);
}
}
} else {
// No duplicate checking - index everything
for (const item of batchWithHashes) {
bulkOperations.push({ index: { _index: INDEX_NAME } });
bulkOperations.push(item.hashes);
}
toIndex = batchWithHashes;
}
// Execute bulk operation only if there are new items to insert
if (bulkOperations.length > 0) {
// Execute bulk operations
if (toIndex.length > 0) {
try {
const bulkResponse = await client.bulk({
operations: bulkOperations,
refresh: false
});
for (const item of toIndex) {
const doc = item.hashes;
const key = `hash:plaintext:${doc.plaintext}`;
// Store main document
pipeline.set(key, JSON.stringify(doc));
// Create indexes for each hash type
pipeline.set(`hash:index:md5:${doc.md5}`, doc.plaintext);
pipeline.set(`hash:index:sha1:${doc.sha1}`, doc.plaintext);
pipeline.set(`hash:index:sha256:${doc.sha256}`, doc.plaintext);
pipeline.set(`hash:index:sha512:${doc.sha512}`, doc.plaintext);
// Update statistics
pipeline.hincrby('hash:stats', 'count', 1);
pipeline.hincrby('hash:stats', 'size', JSON.stringify(doc).length);
}
if (bulkResponse.errors) {
const errorCount = bulkResponse.items.filter((item: any) => item.index?.error).length;
const results = await pipeline.exec();
// Count errors
const errorCount = results?.filter(([err]) => err !== null).length || 0;
if (errorCount > 0) {
state.errors += errorCount;
sessionErrors += errorCount;
const successCount = (bulkOperations.length / 2) - errorCount;
const successCount = toIndex.length - errorCount;
state.indexed += successCount;
sessionIndexed += successCount;
} else {
const count = bulkOperations.length / 2;
state.indexed += count;
sessionIndexed += count;
state.indexed += toIndex.length;
sessionIndexed += toIndex.length;
}
} catch (error) {
console.error(`\n❌ Error processing batch:`, error);
const count = bulkOperations.length / 2;
state.errors += count;
sessionErrors += count;
state.errors += toIndex.length;
sessionErrors += toIndex.length;
}
}
@@ -457,9 +453,8 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
return;
}
// Refresh index
console.log('\n\n🔄 Refreshing index...');
await client.indices.refresh({ index: INDEX_NAME });
// No refresh needed for Redis
console.log('\n\n✅ All data persisted to Redis');
// Delete state file on successful completion
deleteState(stateFile);

Ver fichero

@@ -3,7 +3,7 @@
/**
* Hasher Duplicate Remover Script
*
* This script finds and removes duplicate entries from the Elasticsearch index.
* This script finds and removes duplicate entries from Redis.
* By default it identifies duplicates by checking the md5, sha1, sha256, and sha512 fields;
* use --field to check a single field instead (plaintext is also accepted).
*
* Usage:
@@ -13,20 +13,20 @@
* Options:
* --dry-run Show duplicates without removing them (default)
* --execute Actually remove the duplicates
* --batch-size=<number> Number of items to process in each batch (default: 1000)
* --field=<field> Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
* --help, -h Show this help message
*/
import { Client } from '@elastic/elasticsearch';
import Redis from 'ioredis';
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
const INDEX_NAME = 'hasher';
const DEFAULT_BATCH_SIZE = 1000;
/** Command-line options produced by `parseArgs`. */
interface ParsedArgs {
// True (the default) means duplicates are only reported; `--execute` sets it to false.
dryRun: boolean;
// Positive integer from `--batch-size`; starts as DEFAULT_BATCH_SIZE.
batchSize: number;
// Single field selected with `--field`, or null to use the default field list.
field: string | null;
// Starts as false; presumably set by `--help` / `-h` — the parsing branch is not visible here.
showHelp: boolean;
}
@@ -34,15 +34,23 @@ interface ParsedArgs {
/**
 * One group of entries that share the same value for a given field.
 *
 * NOTE(review): this span comes from a diff hunk, so the ID-based members
 * (`documentIds`, `keepId`, `deleteIds`) and the plaintext-based members
 * (`plaintexts`, `keepPlaintext`, `deletePlaintexts`) probably belong to
 * opposite sides of the change — confirm which set is current.
 */
interface DuplicateGroup {
// The shared field value that defines the group.
value: string;
// Name of the field the duplicates were detected on.
field: string;
// IDs of every document in the group, collected in ascending `created_at` order.
documentIds: string[];
// First entry of `documentIds` (the oldest); this document is kept.
keepId: string;
// Remaining entries of `documentIds`; these documents are deleted.
deleteIds: string[];
// Plaintexts of every document in the group, sorted oldest first by `created_at`.
plaintexts: string[];
// Plaintext of the oldest document; this one is kept.
keepPlaintext: string;
// Plaintexts of the remaining documents; callers may narrow this list before deleting.
deletePlaintexts: string[];
}
/** Shape assigned to the JSON value parsed from a `hash:plaintext:<plaintext>` key. */
interface HashDocument {
// Original word or phrase; also the suffix of the document's key.
plaintext: string;
// Hash values; each one is used to build a `hash:index:<type>:<hash>` key name.
md5: string;
sha1: string;
sha256: string;
sha512: string;
// Timestamp string parsed with `new Date(...)` to order duplicates oldest first.
created_at: string;
}
function parseArgs(args: string[]): ParsedArgs {
const result: ParsedArgs = {
dryRun: true,
batchSize: DEFAULT_BATCH_SIZE,
field: null,
showHelp: false
};
@@ -56,21 +64,6 @@ function parseArgs(args: string[]): ParsedArgs {
result.dryRun = true;
} else if (arg === '--execute') {
result.dryRun = false;
} else if (arg.startsWith('--batch-size=')) {
const value = arg.split('=')[1];
const parsed = parseInt(value, 10);
if (!isNaN(parsed) && parsed > 0) {
result.batchSize = parsed;
}
} else if (arg === '--batch-size') {
const nextArg = args[i + 1];
if (nextArg && !nextArg.startsWith('-')) {
const parsed = parseInt(nextArg, 10);
if (!isNaN(parsed) && parsed > 0) {
result.batchSize = parsed;
i++;
}
}
} else if (arg.startsWith('--field=')) {
result.field = arg.split('=')[1];
} else if (arg === '--field') {
@@ -96,13 +89,15 @@ Usage:
Options:
--dry-run Show duplicates without removing them (default)
--execute Actually remove the duplicates
--batch-size=<number> Number of items to process in each batch (default: 1000)
--field=<field> Check duplicates only on this field
Valid fields: plaintext, md5, sha1, sha256, sha512
--help, -h Show this help message
Environment Variables:
ELASTICSEARCH_NODE Elasticsearch node URL (default: http://localhost:9200)
REDIS_HOST Redis host (default: localhost)
REDIS_PORT Redis port (default: 6379)
REDIS_PASSWORD Redis password (optional)
REDIS_DB Redis database number (default: 0)
Examples:
npx tsx scripts/remove-duplicates.ts # Dry run, show all duplicates
@@ -119,106 +114,78 @@ Notes:
}
async function findDuplicatesForField(
client: Client,
field: string,
batchSize: number
client: Redis,
field: string
): Promise<DuplicateGroup[]> {
const duplicates: DuplicateGroup[] = [];
// Use aggregation to find duplicate values
const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
console.log(` Scanning for ${field} duplicates...`);
// Use composite aggregation to handle large number of duplicates
let afterKey: any = undefined;
let hasMore = true;
// Get all keys for this field type
const pattern = field === 'plaintext'
? 'hash:plaintext:*'
: `hash:index:${field}:*`;
console.log(` Scanning for duplicates...`);
const keys = await client.keys(pattern);
while (hasMore) {
const aggQuery: any = {
index: INDEX_NAME,
size: 0,
aggs: {
duplicates: {
composite: {
size: batchSize,
sources: [
{ value: { terms: { field: fieldToAggregate } } }
],
...(afterKey && { after: afterKey })
},
aggs: {
doc_count_filter: {
bucket_selector: {
buckets_path: { count: '_count' },
script: 'params.count > 1'
}
}
}
}
// For hash indexes, group by hash value (not plaintext)
const valueMap = new Map<string, string[]>();
if (field === 'plaintext') {
// Each key is already unique for plaintext
// Check for same plaintext with different created_at
for (const key of keys) {
const plaintext = key.replace('hash:plaintext:', '');
if (!valueMap.has(plaintext)) {
valueMap.set(plaintext, []);
}
};
const response = await client.search(aggQuery);
const compositeAgg = response.aggregations?.duplicates as any;
const buckets = compositeAgg?.buckets || [];
for (const bucket of buckets) {
if (bucket.doc_count > 1) {
const value = bucket.key.value;
// Use scroll API for large result sets
const documentIds: string[] = [];
let scrollResponse = await client.search({
index: INDEX_NAME,
scroll: '1m',
size: 1000,
query: {
term: {
[fieldToAggregate]: value
}
},
sort: [
{ created_at: { order: 'asc' } }
],
_source: false
});
while (scrollResponse.hits.hits.length > 0) {
documentIds.push(...scrollResponse.hits.hits.map((hit: any) => hit._id));
if (!scrollResponse._scroll_id) break;
scrollResponse = await client.scroll({
scroll_id: scrollResponse._scroll_id,
scroll: '1m'
});
}
// Clear scroll
if (scrollResponse._scroll_id) {
await client.clearScroll({ scroll_id: scrollResponse._scroll_id }).catch(() => {});
}
if (documentIds.length > 1) {
duplicates.push({
value: String(value),
field,
documentIds,
keepId: documentIds[0], // Keep the oldest
deleteIds: documentIds.slice(1) // Delete the rest
});
valueMap.get(plaintext)!.push(plaintext);
}
} else {
// For hash fields, get the plaintext and check if multiple plaintexts have same hash
for (const key of keys) {
const hashValue = key.replace(`hash:index:${field}:`, '');
const plaintext = await client.get(key);
if (plaintext) {
if (!valueMap.has(hashValue)) {
valueMap.set(hashValue, []);
}
valueMap.get(hashValue)!.push(plaintext);
}
}
// Check if there are more results
afterKey = compositeAgg?.after_key;
hasMore = buckets.length === batchSize && afterKey;
}
// Find groups with duplicates
for (const [value, plaintexts] of valueMap) {
const uniquePlaintexts = Array.from(new Set(plaintexts));
if (hasMore) {
process.stdout.write(`\r Found ${duplicates.length} duplicate groups so far...`);
if (uniquePlaintexts.length > 1) {
// Get documents to compare timestamps
const docs: { plaintext: string; doc: HashDocument }[] = [];
for (const plaintext of uniquePlaintexts) {
const docKey = `hash:plaintext:${plaintext}`;
const docData = await client.get(docKey);
if (docData) {
docs.push({ plaintext, doc: JSON.parse(docData) });
}
}
// Sort by created_at (oldest first)
docs.sort((a, b) =>
new Date(a.doc.created_at).getTime() - new Date(b.doc.created_at).getTime()
);
if (docs.length > 1) {
duplicates.push({
value,
field,
plaintexts: docs.map(d => d.plaintext),
keepPlaintext: docs[0].plaintext,
deletePlaintexts: docs.slice(1).map(d => d.plaintext)
});
}
}
}
@@ -226,44 +193,50 @@ async function findDuplicatesForField(
}
async function removeDuplicates(parsedArgs: ParsedArgs) {
const client = new Client({ node: ELASTICSEARCH_NODE });
const client = new Redis({
host: REDIS_HOST,
port: REDIS_PORT,
password: REDIS_PASSWORD,
db: REDIS_DB,
});
const fields = parsedArgs.field
? [parsedArgs.field]
: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
: ['md5', 'sha1', 'sha256', 'sha512'];
console.log(`🔍 Hasher Duplicate Remover`);
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT} (DB ${REDIS_DB})`);
console.log(`Index: ${INDEX_NAME}`);
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
console.log(`Batch size: ${parsedArgs.batchSize}`);
console.log(`Fields to check: ${fields.join(', ')}`);
console.log('');
try {
// Test connection
console.log('🔗 Connecting to Elasticsearch...');
await client.cluster.health({});
console.log('🔗 Connecting to Redis...');
await client.ping();
console.log('✅ Connected successfully\n');
// Get index stats
const countResponse = await client.count({ index: INDEX_NAME });
console.log(`📊 Total documents in index: ${countResponse.count}\n`);
const stats = await client.hgetall('hash:stats');
const totalCount = parseInt(stats.count || '0', 10);
console.log(`📊 Total documents in index: ${totalCount}\n`);
const allDuplicates: DuplicateGroup[] = [];
const seenDeleteIds = new Set<string>();
const seenPlaintexts = new Set<string>();
// Find duplicates for each field
for (const field of fields) {
console.log(`🔍 Checking duplicates for field: ${field}...`);
const fieldDuplicates = await findDuplicatesForField(client, field, parsedArgs.batchSize);
const fieldDuplicates = await findDuplicatesForField(client, field);
// Filter out already seen delete IDs to avoid counting the same document multiple times
// Filter out already seen plaintexts
for (const dup of fieldDuplicates) {
const newDeleteIds = dup.deleteIds.filter(id => !seenDeleteIds.has(id));
if (newDeleteIds.length > 0) {
dup.deleteIds = newDeleteIds;
newDeleteIds.forEach(id => seenDeleteIds.add(id));
const newDeletePlaintexts = dup.deletePlaintexts.filter(p => !seenPlaintexts.has(p));
if (newDeletePlaintexts.length > 0) {
dup.deletePlaintexts = newDeletePlaintexts;
newDeletePlaintexts.forEach(p => seenPlaintexts.add(p));
allDuplicates.push(dup);
}
}
@@ -271,7 +244,7 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
}
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deletePlaintexts.length, 0);
console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`📋 Summary:`);
@@ -281,6 +254,7 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
if (allDuplicates.length === 0) {
console.log('✨ No duplicates found! Index is clean.\n');
await client.quit();
return;
}
@@ -293,8 +267,8 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
: dup.value;
console.log(` Field: ${dup.field}`);
console.log(` Value: ${truncatedValue}`);
console.log(` Keep: ${dup.keepId}`);
console.log(` Delete: ${dup.deleteIds.length} document(s)`);
console.log(` Keep: ${dup.keepPlaintext}`);
console.log(` Delete: ${dup.deletePlaintexts.length} document(s)`);
console.log('');
}
@@ -307,6 +281,7 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
console.log(`🔎 DRY RUN - No changes made`);
console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
await client.quit();
return;
}
@@ -315,53 +290,61 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
let deleted = 0;
let errors = 0;
const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
// Delete in batches
for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
const batch = deleteIds.slice(i, i + parsedArgs.batchSize);
try {
const bulkOperations = batch.flatMap(id => [
{ delete: { _index: INDEX_NAME, _id: id } }
]);
const bulkResponse = await client.bulk({
operations: bulkOperations,
refresh: false
});
if (bulkResponse.errors) {
const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
errors += errorCount;
deleted += batch.length - errorCount;
} else {
deleted += batch.length;
for (const dup of allDuplicates) {
for (const plaintext of dup.deletePlaintexts) {
try {
const docKey = `hash:plaintext:${plaintext}`;
const docData = await client.get(docKey);
if (docData) {
const doc: HashDocument = JSON.parse(docData);
const pipeline = client.pipeline();
// Delete main document
pipeline.del(docKey);
// Delete all indexes
pipeline.del(`hash:index:md5:${doc.md5}`);
pipeline.del(`hash:index:sha1:${doc.sha1}`);
pipeline.del(`hash:index:sha256:${doc.sha256}`);
pipeline.del(`hash:index:sha512:${doc.sha512}`);
// Update statistics
pipeline.hincrby('hash:stats', 'count', -1);
pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);
const results = await pipeline.exec();
if (results && results.some(([err]) => err !== null)) {
errors++;
} else {
deleted++;
}
}
process.stdout.write(`\r⏳ Progress: ${deleted + errors}/${totalToDelete} - Deleted: ${deleted}, Errors: ${errors}`);
} catch (error) {
console.error(`\n❌ Error deleting ${plaintext}:`, error);
errors++;
}
process.stdout.write(`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - Deleted: ${deleted}, Errors: ${errors}`);
} catch (error) {
console.error(`\n❌ Error deleting batch:`, error);
errors += batch.length;
}
}
// Refresh index
console.log('\n\n🔄 Refreshing index...');
await client.indices.refresh({ index: INDEX_NAME });
// Get new count
const newCountResponse = await client.count({ index: INDEX_NAME });
const newStats = await client.hgetall('hash:stats');
const newCount = parseInt(newStats.count || '0', 10);
console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
console.log('\n\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
console.log('✅ Duplicate removal complete!');
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`Documents deleted: ${deleted}`);
console.log(`Errors: ${errors}`);
console.log(`Previous document count: ${countResponse.count}`);
console.log(`New document count: ${newCountResponse.count}`);
console.log(`Previous document count: ${totalCount}`);
console.log(`New document count: ${newCount}`);
console.log('');
await client.quit();
} catch (error) {
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
process.exit(1);
@@ -386,11 +369,10 @@ if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
console.log(`\n🔧 Configuration:`);
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
console.log(` Batch size: ${parsedArgs.batchSize}`);
if (parsedArgs.field) {
console.log(` Field: ${parsedArgs.field}`);
} else {
console.log(` Fields: all (plaintext, md5, sha1, sha256, sha512)`);
console.log(` Fields: all (md5, sha1, sha256, sha512)`);
}
console.log('');