Files
hasher/scripts/remove-duplicates.ts
2025-12-15 17:43:08 +01:00

351 líneas
10 KiB
JavaScript

#!/usr/bin/env node
/**
* Hasher Duplicate Remover Script
*
* This script finds and removes duplicate entries from Redis.
* It identifies duplicates by scanning the md5, sha1, sha256, and sha512 hash indexes (plaintext itself is not compared).
*
* Usage:
* npx tsx scripts/remove-duplicates.ts [options]
* npm run remove-duplicates [-- options]
*
* Options:
* --dry-run Show duplicates without removing them (default)
* --execute Actually remove the duplicates
* --batch-size=<number> Number of keys to scan in each batch (default: 1000)
* --field=<field> Check duplicates only on this field (md5, sha1, sha256, sha512)
* --help, -h Show this help message
*/
import Redis from 'ioredis';
// Redis connection settings, each overridable through an environment variable.
// Because `||` is used, an unset OR empty variable falls back to the default.
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
const REDIS_PORT = Number.parseInt(process.env.REDIS_PORT || '6379', 10);
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
const REDIS_DB = Number.parseInt(process.env.REDIS_DB || '0', 10);

// Batch size used when --batch-size is absent or not a positive integer.
const DEFAULT_BATCH_SIZE = 1000;
/**
 * JSON document this script reads from `hash:plaintext:<plaintext>` keys.
 */
interface HashDocument {
// Original input text; this script also uses it as the suffix of the document key.
plaintext: string;
// Hash values of the entry; each one is used to build a
// `hash:index:<field>:<value>` key when the document's indexes are deleted.
md5: string;
sha1: string;
sha256: string;
sha512: string;
// Creation timestamp, compared as a plain string to pick the oldest entry
// (presumably ISO-8601, where string order matches time order — TODO confirm).
created_at: string;
}
/**
 * Command-line options as produced by parseArgs.
 */
interface ParsedArgs {
// True (the default) to only report duplicates; false once --execute is given.
dryRun: boolean;
// COUNT hint passed to every Redis SCAN call.
batchSize: number;
// Single hash field to check, or null to check every supported field.
field: string | null;
// True when --help or -h was passed.
showHelp: boolean;
}
/**
 * Documents that share one hash value, together with the keep/delete decision.
 */
interface DuplicateGroup {
// The shared hash value (the suffix of the scanned index key).
value: string;
// Hash field the group was found on.
field: string;
// Plaintexts of every document in the group, ordered by created_at ascending.
plaintexts: string[];
// Plaintext of the oldest document, which is retained.
keepPlaintext: string;
// Plaintexts of all remaining documents; removed only in execute mode.
deletePlaintexts: string[];
}
/**
 * Parses a strictly positive integer using `parseInt(text, 10)` semantics.
 *
 * @param text Raw option value, possibly missing.
 * @returns The parsed number, or `null` when the text is missing, not numeric,
 *          or not greater than zero.
 */
function parsePositiveInt(text: string | undefined): number | null {
  if (text === undefined) {
    return null;
  }
  const parsed = parseInt(text, 10);
  return !isNaN(parsed) && parsed > 0 ? parsed : null;
}

/**
 * Turns the raw command-line arguments into a ParsedArgs object.
 *
 * Both `--option=value` and `--option value` spellings are accepted for
 * `--batch-size` and `--field`. Unknown arguments are ignored. An invalid
 * batch size leaves the default in place. The field value is NOT validated
 * here; it is returned verbatim so the caller can reject it.
 *
 * @param args Arguments after the script name (e.g. `process.argv.slice(2)`).
 * @returns The parsed options, with defaults for everything not supplied.
 */
function parseArgs(args: string[]): ParsedArgs {
  const batchSizePrefix = '--batch-size=';
  const fieldPrefix = '--field=';
  const result: ParsedArgs = {
    dryRun: true,
    batchSize: DEFAULT_BATCH_SIZE,
    field: null,
    showHelp: false
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === undefined) {
      continue;
    }

    if (arg === '--help' || arg === '-h') {
      result.showHelp = true;
    } else if (arg === '--dry-run') {
      result.dryRun = true;
    } else if (arg === '--execute') {
      result.dryRun = false;
    } else if (arg.startsWith(batchSizePrefix)) {
      const parsed = parsePositiveInt(arg.slice(batchSizePrefix.length));
      if (parsed !== null) {
        result.batchSize = parsed;
      }
    } else if (arg === '--batch-size') {
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        const parsed = parsePositiveInt(nextArg);
        if (parsed !== null) {
          result.batchSize = parsed;
          // Only consume the following argument when it was a usable value.
          i++;
        }
      }
    } else if (arg.startsWith(fieldPrefix)) {
      // Keep everything after the first '=' so a value such as "md5=x" is
      // passed through intact (and later rejected) instead of being
      // truncated to the valid-looking "md5".
      result.field = arg.slice(fieldPrefix.length);
    } else if (arg === '--field') {
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        result.field = nextArg;
        i++;
      }
    }
  }

  return result;
}
/**
 * Prints the usage text (options, environment variables, examples) to stdout.
 */
function showHelp(): void {
  const helpText = `
Hasher Duplicate Remover Script
Usage:
npx tsx scripts/remove-duplicates.ts [options]
npm run remove-duplicates [-- options]
Options:
--dry-run Show duplicates without removing them (default)
--execute Actually remove the duplicates
--batch-size=<number> Number of keys to scan in each batch (default: 1000)
--field=<field> Check duplicates only on this field
Valid fields: md5, sha1, sha256, sha512
--help, -h Show this help message
Environment Variables:
REDIS_HOST Redis host (default: localhost)
REDIS_PORT Redis port (default: 6379)
REDIS_PASSWORD Redis password (optional)
REDIS_DB Redis database number (default: 0)
Examples:
# Dry run (show duplicates only)
npm run remove-duplicates
# Actually remove duplicates
npm run remove-duplicates -- --execute
# Check only MD5 duplicates
npm run remove-duplicates -- --field=md5 --execute
Description:
This script scans through all hash documents in Redis and identifies
duplicates based on hash values. When duplicates are found, it keeps
the oldest entry (by created_at) and marks the rest for deletion.
`;
  console.log(helpText);
}
/**
 * Scans the `hash:index:<field>:*` keys and reports hash values that are
 * referenced by more than one stored document.
 *
 * For each index key the referenced plaintext is read, then the plaintexts
 * are grouped by hash value. For groups with at least two distinct
 * plaintexts the documents (`hash:plaintext:<plaintext>`) are loaded and
 * ordered by `created_at`; the oldest is marked to keep, the rest to delete.
 * Nothing is modified in Redis by this function.
 *
 * @param client    Connected ioredis client.
 * @param field     Hash field whose index keys are scanned.
 * @param batchSize COUNT hint handed to each SCAN call.
 * @returns One DuplicateGroup per hash value with two or more existing documents.
 * @throws If a Redis command fails or a stored document is not valid JSON.
 */
async function findDuplicatesForField(
  client: Redis,
  field: 'md5' | 'sha1' | 'sha256' | 'sha512',
  batchSize: number
): Promise<DuplicateGroup[]> {
  const keyPrefix = `hash:index:${field}:`;
  const pattern = `${keyPrefix}*`;
  // A Set per hash, not an array: SCAN is allowed to return the same key more
  // than once during a full iteration. Collecting into an array would list one
  // plaintext twice and produce a bogus group whose "keep" and "delete"
  // entries are the very same document.
  const hashToPlaintexts = new Map<string, Set<string>>();

  console.log(`🔍 Scanning ${field} indexes...`);

  let cursor = '0';
  let keysScanned = 0;
  do {
    const [nextCursor, keys] = await client.scan(cursor, 'MATCH', pattern, 'COUNT', batchSize);
    cursor = nextCursor;
    keysScanned += keys.length;

    for (const key of keys) {
      const hash = key.replace(keyPrefix, '');
      const plaintext = await client.get(key);
      if (plaintext) {
        let seen = hashToPlaintexts.get(hash);
        if (seen === undefined) {
          seen = new Set<string>();
          hashToPlaintexts.set(hash, seen);
        }
        seen.add(plaintext);
      }
    }

    // '\r' rewrites the same terminal line to act as a progress counter.
    process.stdout.write(`\r Keys scanned: ${keysScanned} `);
  } while (cursor !== '0');
  console.log('');

  const duplicates: DuplicateGroup[] = [];
  for (const [hash, plaintexts] of hashToPlaintexts) {
    if (plaintexts.size < 2) {
      continue;
    }

    // Fetch documents to get created_at timestamps
    const docs = await Promise.all(
      [...plaintexts].map(async (pt) => {
        const data = await client.get(`hash:plaintext:${pt}`);
        return data ? (JSON.parse(data) as HashDocument) : null;
      })
    );
    // Index entries whose document no longer exists are ignored.
    const validDocs = docs.filter((doc): doc is HashDocument => doc !== null);

    if (validDocs.length > 1) {
      // Sort by created_at, keep oldest
      validDocs.sort((a, b) => a.created_at.localeCompare(b.created_at));
      duplicates.push({
        value: hash,
        field,
        plaintexts: validDocs.map((d) => d.plaintext),
        keepPlaintext: validDocs[0].plaintext,
        deletePlaintexts: validDocs.slice(1).map((d) => d.plaintext)
      });
    }
  }

  return duplicates;
}
/**
 * Prints every duplicate group and, unless `dryRun` is set, deletes the
 * documents listed in `deletePlaintexts`.
 *
 * For each deleted document the function removes `hash:plaintext:<plaintext>`,
 * removes the document's index keys that still point at that plaintext, and
 * decrements the `count` and `size` fields of the `hash:stats` hash. Documents
 * that no longer exist are skipped without being counted.
 *
 * @param client     Connected ioredis client.
 * @param duplicates Groups produced by the duplicate search.
 * @param dryRun     When true, only log what would be deleted.
 * @returns Number of documents deleted and number of failed deletions.
 */
async function removeDuplicates(
  client: Redis,
  duplicates: DuplicateGroup[],
  dryRun: boolean
): Promise<{ deleted: number; errors: number }> {
  let deleted = 0;
  let errors = 0;

  console.log('');
  console.log(`${dryRun ? '🔍 DRY RUN - Would delete:' : '🗑️ Deleting duplicates...'}`);
  console.log('');

  for (const dup of duplicates) {
    console.log(`Duplicate ${dup.field}: ${dup.value}`);
    console.log(` Keep: ${dup.keepPlaintext} (oldest)`);
    console.log(` Delete: ${dup.deletePlaintexts.join(', ')}`);

    if (!dryRun) {
      for (const plaintext of dup.deletePlaintexts) {
        // Never delete the entry this group decided to keep, even if it was
        // (incorrectly) also listed for deletion.
        if (plaintext === dup.keepPlaintext) {
          continue;
        }

        try {
          const docKey = `hash:plaintext:${plaintext}`;
          const docData = await client.get(docKey);
          if (!docData) {
            // Already gone (e.g. removed via another group); nothing to do.
            continue;
          }
          const doc = JSON.parse(docData) as HashDocument;

          const indexKeys = [
            `hash:index:md5:${doc.md5}`,
            `hash:index:sha1:${doc.sha1}`,
            `hash:index:sha256:${doc.sha256}`,
            `hash:index:sha512:${doc.sha512}`
          ];
          // An index key is shared by every document with that hash value, so
          // it may reference the document being kept. Look up the current
          // owner and only drop keys that reference the one being removed.
          const owners = await Promise.all(indexKeys.map((indexKey) => client.get(indexKey)));

          const pipeline = client.pipeline();
          // Delete the main document
          pipeline.del(docKey);
          // Delete the indexes that point at this document
          indexKeys.forEach((indexKey, position) => {
            if (owners[position] === plaintext) {
              pipeline.del(indexKey);
            }
          });
          // Update statistics
          pipeline.hincrby('hash:stats', 'count', -1);
          pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);

          const results = await pipeline.exec();
          // exec() resolves to [error, reply] pairs; any error marks the
          // whole deletion as failed.
          if (results && results.some(([err]) => err !== null)) {
            errors++;
          } else {
            deleted++;
          }
        } catch (error) {
          console.error(` Error deleting ${plaintext}:`, error);
          errors++;
        }
      }
    }
    console.log('');
  }

  return { deleted, errors };
}
/**
 * Script entry point.
 *
 * Parses the command line, validates the requested field, connects to Redis,
 * collects duplicate groups for every selected field and hands them to
 * removeDuplicates (which only deletes when --execute was given). Exits with
 * status 0 after printing help, and with status 1 on an invalid field or on
 * any error raised while talking to Redis.
 */
async function main() {
  const parsed = parseArgs(process.argv.slice(2));
  if (parsed.showHelp) {
    showHelp();
    process.exit(0);
  }

  const validFields = ['md5', 'sha1', 'sha256', 'sha512'] as const;
  type HashField = (typeof validFields)[number];
  const isHashField = (candidate: string): candidate is HashField =>
    (validFields as readonly string[]).includes(candidate);

  // Validate field. The check is against null rather than truthiness so that
  // an empty value (`--field=`) is rejected instead of silently widening the
  // run to every field.
  let fieldsToCheck: readonly HashField[] = validFields;
  if (parsed.field !== null) {
    if (!isHashField(parsed.field)) {
      console.error(`❌ Invalid field: ${parsed.field}`);
      console.error(` Valid fields: ${validFields.join(', ')}`);
      process.exit(1);
    }
    fieldsToCheck = [parsed.field];
  }

  const client = new Redis({
    host: REDIS_HOST,
    port: REDIS_PORT,
    password: REDIS_PASSWORD,
    db: REDIS_DB,
  });

  console.log('');
  console.log('🔍 Hasher Duplicate Remover');
  console.log('━'.repeat(42));
  console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT}`);
  console.log(`Mode: ${parsed.dryRun ? 'DRY RUN' : 'EXECUTE'}`);
  console.log(`Batch size: ${parsed.batchSize}`);
  console.log(`Fields to check: ${fieldsToCheck.join(', ')}`);
  console.log('');

  try {
    console.log('🔗 Connecting to Redis...');
    // PING forces a round trip so connection problems surface before scanning.
    await client.ping();
    console.log('✅ Connected successfully\n');

    const allDuplicates: DuplicateGroup[] = [];
    for (const field of fieldsToCheck) {
      const duplicates = await findDuplicatesForField(client, field, parsed.batchSize);
      allDuplicates.push(...duplicates);
      console.log(` Found ${duplicates.length} duplicate groups for ${field}`);
    }

    console.log('');
    console.log(`📊 Total duplicate groups found: ${allDuplicates.length}`);

    if (allDuplicates.length === 0) {
      console.log('✅ No duplicates found!');
    } else {
      const totalToDelete = allDuplicates.reduce(
        (sum, dup) => sum + dup.deletePlaintexts.length,
        0
      );
      console.log(` Total documents to delete: ${totalToDelete}`);

      const { deleted, errors } = await removeDuplicates(client, allDuplicates, parsed.dryRun);

      if (!parsed.dryRun) {
        console.log('━'.repeat(42));
        console.log('✅ Removal complete!');
        console.log('');
        console.log('📊 Statistics:');
        console.log(` Deleted: ${deleted}`);
        console.log(` Errors: ${errors}`);
      } else {
        console.log('━'.repeat(42));
        console.log('💡 This was a dry run. Use --execute to actually remove duplicates.');
      }
    }

    await client.quit();
  } catch (error) {
    console.error('\n\n❌ Error:', error);
    await client.quit();
    process.exit(1);
  }
}
// Start the script. `void` marks the returned promise as intentionally not awaited.
void main();