Files
hasher/scripts/remove-duplicates.ts
2025-12-15 16:35:35 +01:00

380 líneas
12 KiB
JavaScript

#!/usr/bin/env node
/**
* Hasher Duplicate Remover Script
*
* This script finds and removes duplicate entries from Redis.
* By default it checks the md5, sha1, sha256, and sha512 index fields; use --field to
* restrict the check to a single field (plaintext is only checked when requested explicitly).
*
* Usage:
* npx tsx scripts/remove-duplicates.ts [options]
* npm run remove-duplicates [-- options]
*
* Options:
* --dry-run Show duplicates without removing them (default)
* --execute Actually remove the duplicates
* --field=<field> Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
* --help, -h Show this help message
*/
import Redis from 'ioredis';
// Redis connection settings. Each value can be overridden through the
// environment; an unset or empty variable falls back to the default.
const REDIS_HOST = process.env['REDIS_HOST'] || 'localhost';
const REDIS_PORT = Number.parseInt(process.env['REDIS_PORT'] || '6379', 10);
const REDIS_PASSWORD = process.env['REDIS_PASSWORD'] || undefined;
const REDIS_DB = Number.parseInt(process.env['REDIS_DB'] || '0', 10);
// Label printed in the run banner.
const INDEX_NAME = 'hasher';
/** Options produced by parsing the command line (see parseArgs). */
interface ParsedArgs {
// True unless --execute was passed; in dry-run mode nothing is deleted.
dryRun: boolean;
// Field named via --field, or null when no field was given.
field: string | null;
// True when --help or -h was passed.
showHelp: boolean;
}
/** A set of documents that share the same value for one field. */
interface DuplicateGroup {
// The shared value the group was keyed on (a hash value, or the plaintext itself).
value: string;
// Name of the field the group was found on.
field: string;
// Plaintexts of all group members whose document could be loaded, oldest first.
plaintexts: string[];
// Plaintext of the oldest member (by created_at); this one is kept.
keepPlaintext: string;
// Plaintexts of the remaining members; removeDuplicates may narrow this list
// to drop entries already scheduled for deletion by an earlier field.
deletePlaintexts: string[];
}
/**
* Shape this script expects after JSON-parsing the value stored under
* `hash:plaintext:<plaintext>`. The parsed value is not validated at runtime.
*/
interface HashDocument {
plaintext: string;
// The four hash fields are used to build the `hash:index:<field>:<value>` key names.
md5: string;
sha1: string;
sha256: string;
sha512: string;
// Parsed with `new Date(...)` to order group members oldest-first.
created_at: string;
}
/**
 * Parses the script's command-line arguments.
 *
 * Recognised flags: `--help` / `-h`, `--dry-run`, `--execute`,
 * `--field=<name>` and `--field <name>`. Unrecognised arguments are ignored.
 * When both `--dry-run` and `--execute` are present, the last one wins.
 *
 * The field name is NOT validated here; it is returned exactly as typed.
 *
 * @param args - Arguments following the script name (e.g. `process.argv.slice(2)`).
 * @returns The parsed options; `dryRun` defaults to `true`, `field` to `null`.
 * @throws Error when `--field` is present without a value and help was not
 *   requested. A missing value used to be dropped silently, which made the
 *   run fall back to checking ALL fields — dangerous together with `--execute`.
 */
function parseArgs(args: string[]): ParsedArgs {
  const FIELD_PREFIX = '--field=';
  const result: ParsedArgs = {
    dryRun: true,
    field: null,
    showHelp: false
  };
  // Deferred so that `--help` still works even when `--field` is malformed.
  let fieldError: string | null = null;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '--help' || arg === '-h') {
      result.showHelp = true;
    } else if (arg === '--dry-run') {
      result.dryRun = true;
    } else if (arg === '--execute') {
      result.dryRun = false;
    } else if (arg.startsWith(FIELD_PREFIX)) {
      // Take everything after the first '=' so values containing '=' are
      // reported back verbatim instead of being truncated.
      const value = arg.slice(FIELD_PREFIX.length);
      if (value === '') {
        fieldError = 'Missing value for --field (expected --field=<field>)';
      } else {
        result.field = value;
      }
    } else if (arg === '--field') {
      const nextArg = args[i + 1];
      if (!nextArg || nextArg.startsWith('-')) {
        // Do not consume the next token: it is another flag (or absent).
        fieldError = 'Missing value for --field (expected --field <field>)';
      } else {
        result.field = nextArg;
        i++;
      }
    }
  }

  if (fieldError !== null && !result.showHelp) {
    throw new Error(fieldError);
  }
  return result;
}
/**
 * Prints the command-line usage text to stdout with a single `console.log`
 * call, then terminates the process with exit code 0.
 */
function showHelp() {
  // Leading and trailing empty entries reproduce the blank line before and
  // the newline after the help text.
  const helpLines = [
    '',
    'Hasher Duplicate Remover Script',
    'Usage:',
    'npx tsx scripts/remove-duplicates.ts [options]',
    'npm run remove-duplicates [-- options]',
    'Options:',
    '--dry-run Show duplicates without removing them (default)',
    '--execute Actually remove the duplicates',
    '--field=<field> Check duplicates only on this field',
    'Valid fields: plaintext, md5, sha1, sha256, sha512',
    '--help, -h Show this help message',
    'Environment Variables:',
    'REDIS_HOST Redis host (default: localhost)',
    'REDIS_PORT Redis port (default: 6379)',
    'REDIS_PASSWORD Redis password (optional)',
    'REDIS_DB Redis database number (default: 0)',
    'Examples:',
    'npx tsx scripts/remove-duplicates.ts # Dry run, show all duplicates',
    'npx tsx scripts/remove-duplicates.ts --execute # Remove all duplicates',
    'npx tsx scripts/remove-duplicates.ts --field=md5 # Check only md5 duplicates',
    'npx tsx scripts/remove-duplicates.ts --execute --field=plaintext',
    'Notes:',
    '- The script keeps the OLDEST document (by created_at) and removes newer duplicates',
    '- Always run with --dry-run first to review what will be deleted',
    '- Duplicates are checked across all hash fields by default',
    '',
  ];
  console.log(helpLines.join('\n'));
  process.exit(0);
}
/**
 * Collects every key matching `pattern` by walking the keyspace with SCAN.
 * SCAN may report the same key more than once, so results are de-duplicated.
 */
async function scanKeysMatching(client: Redis, pattern: string): Promise<string[]> {
  const found = new Set<string>();
  let cursor = '0';
  do {
    const [nextCursor, page] = await client.scan(cursor, 'MATCH', pattern, 'COUNT', 1000);
    for (const key of page) {
      found.add(key);
    }
    cursor = nextCursor;
  } while (cursor !== '0');
  return Array.from(found);
}

/**
 * Finds groups of documents that share the same value for `field`.
 *
 * For `plaintext` the `hash:plaintext:*` keys are grouped by their plaintext
 * suffix; for a hash field the `hash:index:<field>:*` keys are grouped by the
 * hash value in the key name, and each key's stored string is taken as the
 * plaintext it points to. Groups with more than one distinct plaintext have
 * their documents loaded and ordered by `created_at`; the oldest is marked to
 * keep and the rest to delete.
 *
 * NOTE(review): Redis keys are unique, so every key scanned here contributes
 * exactly one plaintext to exactly one group. With this key layout no group
 * can end up with more than one member, which means this function appears
 * unable to report any duplicates. Confirm how duplicates are supposed to be
 * represented in Redis before relying on this script.
 *
 * @param client - Connected ioredis client.
 * @param field - `plaintext` or one of the hash field names.
 * @returns One entry per duplicate group; empty when none are found.
 */
async function findDuplicatesForField(
  client: Redis,
  field: string
): Promise<DuplicateGroup[]> {
  const DOC_PREFIX = 'hash:plaintext:';
  const duplicates: DuplicateGroup[] = [];
  console.log(` Scanning for ${field} duplicates...`);

  const isPlaintextField = field === 'plaintext';
  const keyPrefix = isPlaintextField ? DOC_PREFIX : `hash:index:${field}:`;

  // SCAN instead of KEYS: KEYS walks the whole keyspace in one blocking call.
  const keys = await scanKeysMatching(client, `${keyPrefix}*`);

  // Group plaintexts by the value encoded in the key name.
  const valueMap = new Map<string, string[]>();
  const addToGroup = (value: string, plaintext: string): void => {
    const members = valueMap.get(value);
    if (members) {
      members.push(plaintext);
    } else {
      valueMap.set(value, [plaintext]);
    }
  };

  for (const key of keys) {
    const value = key.slice(keyPrefix.length);
    if (isPlaintextField) {
      addToGroup(value, value);
      continue;
    }
    const plaintext = await client.get(key);
    // Compare against null so an index entry for the empty-string plaintext
    // is not silently dropped.
    if (plaintext !== null) {
      addToGroup(value, plaintext);
    }
  }

  // Find groups with duplicates
  for (const [value, plaintexts] of valueMap) {
    const uniquePlaintexts = Array.from(new Set(plaintexts));
    if (uniquePlaintexts.length <= 1) {
      continue;
    }

    // Load the documents so members can be ordered by timestamp.
    const docs: { plaintext: string; doc: HashDocument }[] = [];
    for (const plaintext of uniquePlaintexts) {
      const docKey = `${DOC_PREFIX}${plaintext}`;
      const docData = await client.get(docKey);
      if (!docData) {
        continue;
      }
      try {
        docs.push({ plaintext, doc: JSON.parse(docData) });
      } catch (error) {
        // One corrupt document should not abort the whole scan; it is left
        // out of the group and therefore never scheduled for deletion.
        console.warn(` Skipping ${docKey}: stored value is not valid JSON`);
      }
    }

    // Sort by created_at (oldest first)
    docs.sort((a, b) =>
      new Date(a.doc.created_at).getTime() - new Date(b.doc.created_at).getTime()
    );

    if (docs.length > 1) {
      duplicates.push({
        value,
        field,
        plaintexts: docs.map(d => d.plaintext),
        keepPlaintext: docs[0].plaintext,
        deletePlaintexts: docs.slice(1).map(d => d.plaintext)
      });
    }
  }
  return duplicates;
}
/**
 * Deletes one duplicate document together with the index entries that still
 * point at it, and decrements the `hash:stats` counters.
 *
 * Index keys store the plaintext they resolve to. Members of a duplicate
 * group share the index key for the duplicated value, so an index key is only
 * deleted when its current value is the plaintext being removed; otherwise
 * the entry belongs to another document (typically the one being kept).
 *
 * The ownership check and the deletion are separate round trips, so this is
 * not atomic with respect to concurrent writers.
 *
 * @returns `'missing'` when the document no longer exists, `'failed'` when
 *   any pipelined command reported an error, `'deleted'` otherwise.
 */
async function deleteDuplicateDocument(
  client: Redis,
  plaintext: string
): Promise<'deleted' | 'failed' | 'missing'> {
  const docKey = `hash:plaintext:${plaintext}`;
  const docData = await client.get(docKey);
  if (!docData) {
    return 'missing';
  }

  const doc: HashDocument = JSON.parse(docData);
  const indexKeys = [
    `hash:index:md5:${doc.md5}`,
    `hash:index:sha1:${doc.sha1}`,
    `hash:index:sha256:${doc.sha256}`,
    `hash:index:sha512:${doc.sha512}`,
  ];
  const owners = await client.mget(...indexKeys);

  const pipeline = client.pipeline();
  // Delete main document
  pipeline.del(docKey);
  // Delete only the indexes that still resolve to this document
  indexKeys.forEach((indexKey, i) => {
    if (owners[i] === plaintext) {
      pipeline.del(indexKey);
    }
  });
  // Update statistics
  // NOTE(review): size is decremented by the re-serialised length; confirm
  // this matches how the size was added when the document was stored.
  pipeline.hincrby('hash:stats', 'count', -1);
  pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);

  const results = await pipeline.exec();
  const anyCommandFailed = results !== null && results.some(([err]) => err !== null);
  return anyCommandFailed ? 'failed' : 'deleted';
}

/**
 * Runs the duplicate scan and, unless in dry-run mode, deletes the newer
 * members of each duplicate group.
 *
 * Steps: connect and ping, read the document count from `hash:stats`, collect
 * duplicate groups for each selected field (a plaintext already scheduled for
 * deletion by an earlier field is not scheduled again), print a summary and up
 * to ten sample groups, then either stop (dry run) or delete and report.
 *
 * Any error raised after the client is created is logged and the process
 * exits with code 1.
 *
 * @param parsedArgs - Parsed command-line options; `field` selects a single
 *   field, otherwise md5, sha1, sha256 and sha512 are checked.
 */
async function removeDuplicates(parsedArgs: ParsedArgs) {
  const RULE = '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━';
  const client = new Redis({
    host: REDIS_HOST,
    port: REDIS_PORT,
    password: REDIS_PASSWORD,
    db: REDIS_DB,
  });
  const fields = parsedArgs.field
    ? [parsedArgs.field]
    : ['md5', 'sha1', 'sha256', 'sha512'];

  console.log(`🔍 Hasher Duplicate Remover`);
  console.log(RULE);
  console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT} (DB ${REDIS_DB})`);
  console.log(`Index: ${INDEX_NAME}`);
  console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
  console.log(`Fields to check: ${fields.join(', ')}`);
  console.log('');

  try {
    // Test connection
    console.log('🔗 Connecting to Redis...');
    await client.ping();
    console.log('✅ Connected successfully\n');

    // Get index stats
    const stats = await client.hgetall('hash:stats');
    const totalCount = parseInt(stats.count || '0', 10);
    console.log(`📊 Total documents in index: ${totalCount}\n`);

    const allDuplicates: DuplicateGroup[] = [];
    const seenPlaintexts = new Set<string>();

    // Find duplicates for each field
    for (const field of fields) {
      console.log(`🔍 Checking duplicates for field: ${field}...`);
      const fieldDuplicates = await findDuplicatesForField(client, field);

      // Drop plaintexts an earlier field already scheduled for deletion.
      for (const dup of fieldDuplicates) {
        const newDeletePlaintexts = dup.deletePlaintexts.filter(p => !seenPlaintexts.has(p));
        if (newDeletePlaintexts.length > 0) {
          dup.deletePlaintexts = newDeletePlaintexts;
          newDeletePlaintexts.forEach(p => seenPlaintexts.add(p));
          allDuplicates.push(dup);
        }
      }
      console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
    }

    const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deletePlaintexts.length, 0);
    console.log(`\n${RULE}`);
    console.log(`📋 Summary:`);
    console.log(` Duplicate groups found: ${allDuplicates.length}`);
    console.log(` Documents to delete: ${totalToDelete}`);
    console.log(`${RULE}\n`);

    if (allDuplicates.length === 0) {
      console.log('✨ No duplicates found! Index is clean.\n');
      await client.quit();
      return;
    }

    // Show sample of duplicates
    console.log(`📝 Sample duplicates (showing first 10):\n`);
    for (const dup of allDuplicates.slice(0, 10)) {
      const truncatedValue = dup.value.length > 50
        ? dup.value.substring(0, 50) + '...'
        : dup.value;
      console.log(` Field: ${dup.field}`);
      console.log(` Value: ${truncatedValue}`);
      console.log(` Keep: ${dup.keepPlaintext}`);
      console.log(` Delete: ${dup.deletePlaintexts.length} document(s)`);
      console.log('');
    }
    if (allDuplicates.length > 10) {
      console.log(` ... and ${allDuplicates.length - 10} more duplicate groups\n`);
    }

    if (parsedArgs.dryRun) {
      console.log(RULE);
      console.log(`🔎 DRY RUN - No changes made`);
      console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
      console.log(`${RULE}\n`);
      await client.quit();
      return;
    }

    // Execute deletion
    console.log(`\n🗑️ Removing ${totalToDelete} duplicate documents...\n`);
    let deleted = 0;
    let errors = 0;
    // Documents that vanished between the scan and the delete; counted so the
    // progress figure can reach the total.
    let skipped = 0;

    for (const dup of allDuplicates) {
      for (const plaintext of dup.deletePlaintexts) {
        try {
          const outcome = await deleteDuplicateDocument(client, plaintext);
          if (outcome === 'deleted') {
            deleted++;
          } else if (outcome === 'failed') {
            errors++;
          } else {
            skipped++;
          }
          process.stdout.write(`\r⏳ Progress: ${deleted + errors + skipped}/${totalToDelete} - Deleted: ${deleted}, Errors: ${errors}`);
        } catch (error) {
          console.error(`\n❌ Error deleting ${plaintext}:`, error);
          errors++;
        }
      }
    }

    // Get new count
    const newStats = await client.hgetall('hash:stats');
    const newCount = parseInt(newStats.count || '0', 10);
    console.log(`\n\n${RULE}`);
    console.log('✅ Duplicate removal complete!');
    console.log(RULE);
    console.log(`Documents deleted: ${deleted}`);
    console.log(`Errors: ${errors}`);
    if (skipped > 0) {
      console.log(`Skipped (already missing): ${skipped}`);
    }
    console.log(`Previous document count: ${totalCount}`);
    console.log(`New document count: ${newCount}`);
    console.log('');
    await client.quit();
  } catch (error) {
    console.error('\n❌ Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  }
}
// Script entry point: parse the command line, validate it, echo the
// effective configuration, then run the duplicate remover.
const args = process.argv.slice(2);
const parsedArgs = parseArgs(args);
if (parsedArgs.showHelp) {
  showHelp();
}

// Validate field if provided. The check compares against null rather than
// relying on truthiness so that an empty value (e.g. from `--field=`) is
// rejected instead of being treated as "no field given", which would make
// the run cover every field.
const validFields = ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
if (parsedArgs.field !== null && !validFields.includes(parsedArgs.field)) {
  console.error(`❌ Invalid field: ${parsedArgs.field}`);
  console.error(` Valid fields: ${validFields.join(', ')}`);
  process.exit(1);
}

console.log(`\n🔧 Configuration:`);
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
if (parsedArgs.field) {
  console.log(` Field: ${parsedArgs.field}`);
} else {
  console.log(` Fields: all (md5, sha1, sha256, sha512)`);
}
console.log('');

// Log a rejection AND report it through the exit code; logging alone would
// leave the process exiting with status 0 after a failure.
removeDuplicates(parsedArgs).catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});