#!/usr/bin/env node

/**
 * Hasher Duplicate Remover Script
 *
 * This script finds and removes duplicate entries from Redis.
 * It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
 *
 * Usage:
 *   npx tsx scripts/remove-duplicates.ts [options]
 *   npm run remove-duplicates [-- options]
 *
 * Options:
 *   --dry-run              Show duplicates without removing them (default)
 *   --execute              Actually remove the duplicates
 *   --batch-size=<number>  Number of keys to scan in each batch (default: 1000)
 *   --field=<field>        Check duplicates only on this field (md5, sha1, sha256, sha512)
 *   --help, -h             Show this help message
 */
|
|
|
|
import Redis from 'ioredis';
|
|
|
|
// Redis connection settings, each overridable via environment variables.
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
// An unset/empty REDIS_PASSWORD becomes undefined so no AUTH is attempted.
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);

// Default COUNT hint passed to SCAN on each iteration (see --batch-size).
const DEFAULT_BATCH_SIZE = 1000;
|
|
|
|
/**
 * Shape of a stored hash document, kept as JSON under the
 * `hash:plaintext:<plaintext>` key.
 */
interface HashDocument {
  plaintext: string;  // the original input string
  md5: string;        // hex digest of the plaintext
  sha1: string;
  sha256: string;
  sha512: string;
  created_at: string; // timestamp string; compared lexicographically to pick the oldest entry
}
|
|
|
|
/** Result of parsing the command-line arguments (see parseArgs). */
interface ParsedArgs {
  dryRun: boolean;      // true unless --execute was given
  batchSize: number;    // SCAN COUNT hint; defaults to DEFAULT_BATCH_SIZE
  field: string | null; // value of --field, or null to check all fields
  showHelp: boolean;    // set by --help / -h
}
|
|
|
|
/** A group of documents that share the same hash value for one field. */
interface DuplicateGroup {
  value: string;              // the shared hash value
  field: string;              // which hash field collided (md5/sha1/sha256/sha512)
  plaintexts: string[];       // all plaintexts in the group, oldest first
  keepPlaintext: string;      // the oldest entry, which is retained
  deletePlaintexts: string[]; // the newer entries, marked for deletion
}
|
|
|
|
function parseArgs(args: string[]): ParsedArgs {
|
|
const result: ParsedArgs = {
|
|
dryRun: true,
|
|
batchSize: DEFAULT_BATCH_SIZE,
|
|
field: null,
|
|
showHelp: false
|
|
};
|
|
|
|
for (let i = 0; i < args.length; i++) {
|
|
const arg = args[i];
|
|
|
|
if (arg === '--help' || arg === '-h') {
|
|
result.showHelp = true;
|
|
} else if (arg === '--dry-run') {
|
|
result.dryRun = true;
|
|
} else if (arg === '--execute') {
|
|
result.dryRun = false;
|
|
} else if (arg.startsWith('--batch-size=')) {
|
|
const value = arg.split('=')[1];
|
|
const parsed = parseInt(value, 10);
|
|
if (!isNaN(parsed) && parsed > 0) {
|
|
result.batchSize = parsed;
|
|
}
|
|
} else if (arg === '--batch-size') {
|
|
const nextArg = args[i + 1];
|
|
if (nextArg && !nextArg.startsWith('-')) {
|
|
const parsed = parseInt(nextArg, 10);
|
|
if (!isNaN(parsed) && parsed > 0) {
|
|
result.batchSize = parsed;
|
|
i++;
|
|
}
|
|
}
|
|
} else if (arg.startsWith('--field=')) {
|
|
result.field = arg.split('=')[1];
|
|
} else if (arg === '--field') {
|
|
const nextArg = args[i + 1];
|
|
if (nextArg && !nextArg.startsWith('-')) {
|
|
result.field = nextArg;
|
|
i++;
|
|
}
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
// Print the full usage/help text to stdout. The template literal below IS
// the user-visible output; edit with care.
function showHelp() {
  console.log(`
Hasher Duplicate Remover Script

Usage:
  npx tsx scripts/remove-duplicates.ts [options]
  npm run remove-duplicates [-- options]

Options:
  --dry-run              Show duplicates without removing them (default)
  --execute              Actually remove the duplicates
  --batch-size=<number>  Number of keys to scan in each batch (default: 1000)
  --field=<field>        Check duplicates only on this field
                         Valid fields: md5, sha1, sha256, sha512
  --help, -h             Show this help message

Environment Variables:
  REDIS_HOST             Redis host (default: localhost)
  REDIS_PORT             Redis port (default: 6379)
  REDIS_PASSWORD         Redis password (optional)
  REDIS_DB               Redis database number (default: 0)

Examples:
  # Dry run (show duplicates only)
  npm run remove-duplicates

  # Actually remove duplicates
  npm run remove-duplicates -- --execute

  # Check only MD5 duplicates
  npm run remove-duplicates -- --field=md5 --execute

Description:
  This script scans through all hash documents in Redis and identifies
  duplicates based on hash values. When duplicates are found, it keeps
  the oldest entry (by created_at) and marks the rest for deletion.
`);
}
|
|
|
|
async function findDuplicatesForField(
|
|
client: Redis,
|
|
field: 'md5' | 'sha1' | 'sha256' | 'sha512',
|
|
batchSize: number
|
|
): Promise<DuplicateGroup[]> {
|
|
const pattern = `hash:index:${field}:*`;
|
|
const hashToPlaintexts: Map<string, string[]> = new Map();
|
|
|
|
console.log(`🔍 Scanning ${field} indexes...`);
|
|
|
|
let cursor = '0';
|
|
let keysScanned = 0;
|
|
|
|
do {
|
|
const [nextCursor, keys] = await client.scan(cursor, 'MATCH', pattern, 'COUNT', batchSize);
|
|
cursor = nextCursor;
|
|
keysScanned += keys.length;
|
|
|
|
for (const key of keys) {
|
|
const hash = key.replace(`hash:index:${field}:`, '');
|
|
const plaintext = await client.get(key);
|
|
|
|
if (plaintext) {
|
|
if (!hashToPlaintexts.has(hash)) {
|
|
hashToPlaintexts.set(hash, []);
|
|
}
|
|
hashToPlaintexts.get(hash)!.push(plaintext);
|
|
}
|
|
}
|
|
|
|
process.stdout.write(`\r Keys scanned: ${keysScanned} `);
|
|
} while (cursor !== '0');
|
|
|
|
console.log('');
|
|
|
|
const duplicates: DuplicateGroup[] = [];
|
|
|
|
for (const [hash, plaintexts] of hashToPlaintexts.entries()) {
|
|
if (plaintexts.length > 1) {
|
|
// Fetch documents to get created_at timestamps
|
|
const docs = await Promise.all(
|
|
plaintexts.map(async (pt) => {
|
|
const data = await client.get(`hash:plaintext:${pt}`);
|
|
return data ? JSON.parse(data) as HashDocument : null;
|
|
})
|
|
);
|
|
|
|
const validDocs = docs.filter((doc): doc is HashDocument => doc !== null);
|
|
|
|
if (validDocs.length > 1) {
|
|
// Sort by created_at, keep oldest
|
|
validDocs.sort((a, b) => a.created_at.localeCompare(b.created_at));
|
|
|
|
duplicates.push({
|
|
value: hash,
|
|
field,
|
|
plaintexts: validDocs.map(d => d.plaintext),
|
|
keepPlaintext: validDocs[0].plaintext,
|
|
deletePlaintexts: validDocs.slice(1).map(d => d.plaintext)
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
return duplicates;
|
|
}
|
|
|
|
async function removeDuplicates(
|
|
client: Redis,
|
|
duplicates: DuplicateGroup[],
|
|
dryRun: boolean
|
|
): Promise<{ deleted: number; errors: number }> {
|
|
let deleted = 0;
|
|
let errors = 0;
|
|
|
|
console.log('');
|
|
console.log(`${dryRun ? '🔍 DRY RUN - Would delete:' : '🗑️ Deleting duplicates...'}`);
|
|
console.log('');
|
|
|
|
for (const dup of duplicates) {
|
|
console.log(`Duplicate ${dup.field}: ${dup.value}`);
|
|
console.log(` Keep: ${dup.keepPlaintext} (oldest)`);
|
|
console.log(` Delete: ${dup.deletePlaintexts.join(', ')}`);
|
|
|
|
if (!dryRun) {
|
|
for (const plaintext of dup.deletePlaintexts) {
|
|
try {
|
|
const docKey = `hash:plaintext:${plaintext}`;
|
|
const docData = await client.get(docKey);
|
|
|
|
if (docData) {
|
|
const doc: HashDocument = JSON.parse(docData);
|
|
const pipeline = client.pipeline();
|
|
|
|
// Delete the main document
|
|
pipeline.del(docKey);
|
|
|
|
// Delete all indexes
|
|
pipeline.del(`hash:index:md5:${doc.md5}`);
|
|
pipeline.del(`hash:index:sha1:${doc.sha1}`);
|
|
pipeline.del(`hash:index:sha256:${doc.sha256}`);
|
|
pipeline.del(`hash:index:sha512:${doc.sha512}`);
|
|
|
|
// Update statistics
|
|
pipeline.hincrby('hash:stats', 'count', -1);
|
|
pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);
|
|
|
|
const results = await pipeline.exec();
|
|
|
|
if (results && results.some(([err]) => err !== null)) {
|
|
errors++;
|
|
} else {
|
|
deleted++;
|
|
}
|
|
}
|
|
} catch (error) {
|
|
console.error(` Error deleting ${plaintext}:`, error);
|
|
errors++;
|
|
}
|
|
}
|
|
}
|
|
console.log('');
|
|
}
|
|
|
|
return { deleted, errors };
|
|
}
|
|
|
|
/**
 * Entry point: parse CLI args, connect to Redis, find duplicate groups for
 * the requested field(s), and either report them (dry run, the default) or
 * delete them (--execute).
 *
 * Exits 0 after --help, 1 on an invalid --field or any runtime error.
 */
async function main() {
  const args = process.argv.slice(2);
  const parsed = parseArgs(args);

  if (parsed.showHelp) {
    showHelp();
    process.exit(0);
  }

  const validFields: Array<'md5' | 'sha1' | 'sha256' | 'sha512'> = ['md5', 'sha1', 'sha256', 'sha512'];
  // With --field, narrow the scan to that single field; otherwise all four.
  const fieldsToCheck = parsed.field
    ? [parsed.field as 'md5' | 'sha1' | 'sha256' | 'sha512']
    : validFields;

  // Validate field. (fieldsToCheck is computed above with an unchecked cast,
  // but the process exits here before an invalid value is ever used.)
  if (parsed.field && !validFields.includes(parsed.field as any)) {
    console.error(`❌ Invalid field: ${parsed.field}`);
    console.error(`   Valid fields: ${validFields.join(', ')}`);
    process.exit(1);
  }

  const client = new Redis({
    host: REDIS_HOST,
    port: REDIS_PORT,
    password: REDIS_PASSWORD,
    db: REDIS_DB,
  });

  console.log('');
  console.log('🔍 Hasher Duplicate Remover');
  console.log('━'.repeat(42));
  console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT}`);
  console.log(`Mode: ${parsed.dryRun ? 'DRY RUN' : 'EXECUTE'}`);
  console.log(`Batch size: ${parsed.batchSize}`);
  console.log(`Fields to check: ${fieldsToCheck.join(', ')}`);
  console.log('');

  try {
    console.log('🔗 Connecting to Redis...');
    // PING fails fast if Redis is unreachable or auth is wrong.
    await client.ping();
    console.log('✅ Connected successfully\n');

    const allDuplicates: DuplicateGroup[] = [];

    // Fields are scanned sequentially so the progress output stays readable.
    for (const field of fieldsToCheck) {
      const duplicates = await findDuplicatesForField(client, field, parsed.batchSize);
      allDuplicates.push(...duplicates);
      console.log(`   Found ${duplicates.length} duplicate groups for ${field}`);
    }

    console.log('');
    console.log(`📊 Total duplicate groups found: ${allDuplicates.length}`);

    if (allDuplicates.length === 0) {
      console.log('✅ No duplicates found!');
    } else {
      // Each group keeps one document; everything else is slated for deletion.
      const totalToDelete = allDuplicates.reduce(
        (sum, dup) => sum + dup.deletePlaintexts.length,
        0
      );
      console.log(`   Total documents to delete: ${totalToDelete}`);

      const { deleted, errors } = await removeDuplicates(client, allDuplicates, parsed.dryRun);

      if (!parsed.dryRun) {
        console.log('━'.repeat(42));
        console.log('✅ Removal complete!');
        console.log('');
        console.log('📊 Statistics:');
        console.log(`   Deleted: ${deleted}`);
        console.log(`   Errors: ${errors}`);
      } else {
        console.log('━'.repeat(42));
        console.log('💡 This was a dry run. Use --execute to actually remove duplicates.');
      }
    }

    await client.quit();
  } catch (error) {
    console.error('\n\n❌ Error:', error);
    await client.quit();
    process.exit(1);
  }
}

main();
|