#!/usr/bin/env node
|
|
|
|
/**
|
|
* Hasher Duplicate Remover Script
|
|
*
|
|
* This script finds and removes duplicate entries from Redis.
|
|
* It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
|
|
*
|
|
* Usage:
|
|
* npx tsx scripts/remove-duplicates.ts [options]
|
|
* npm run remove-duplicates [-- options]
|
|
*
|
|
* Options:
|
|
* --dry-run Show duplicates without removing them (default)
|
|
* --execute Actually remove the duplicates
|
|
* --field=<field> Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
|
|
* --help, -h Show this help message
|
|
*/
|
|
|
|
import Redis from 'ioredis';
|
|
|
|
// Redis connection settings, overridable via environment variables.
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
// Optional password; ioredis treats undefined as "no AUTH".
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
// Logical index name — used only for display in the summary output.
const INDEX_NAME = 'hasher';
|
|
|
|
interface ParsedArgs {
|
|
dryRun: boolean;
|
|
field: string | null;
|
|
showHelp: boolean;
|
|
}
|
|
|
|
interface DuplicateGroup {
|
|
value: string;
|
|
field: string;
|
|
plaintexts: string[];
|
|
keepPlaintext: string;
|
|
deletePlaintexts: string[];
|
|
}
|
|
|
|
interface HashDocument {
|
|
plaintext: string;
|
|
md5: string;
|
|
sha1: string;
|
|
sha256: string;
|
|
sha512: string;
|
|
created_at: string;
|
|
}
|
|
|
|
function parseArgs(args: string[]): ParsedArgs {
|
|
const result: ParsedArgs = {
|
|
dryRun: true,
|
|
field: null,
|
|
showHelp: false
|
|
};
|
|
|
|
for (let i = 0; i < args.length; i++) {
|
|
const arg = args[i];
|
|
|
|
if (arg === '--help' || arg === '-h') {
|
|
result.showHelp = true;
|
|
} else if (arg === '--dry-run') {
|
|
result.dryRun = true;
|
|
} else if (arg === '--execute') {
|
|
result.dryRun = false;
|
|
} else if (arg.startsWith('--field=')) {
|
|
result.field = arg.split('=')[1];
|
|
} else if (arg === '--field') {
|
|
const nextArg = args[i + 1];
|
|
if (nextArg && !nextArg.startsWith('-')) {
|
|
result.field = nextArg;
|
|
i++;
|
|
}
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
 * Print usage information to stdout and terminate the process with exit
 * code 0. This function never returns.
 */
function showHelp() {
  console.log(`
Hasher Duplicate Remover Script

Usage:
  npx tsx scripts/remove-duplicates.ts [options]
  npm run remove-duplicates [-- options]

Options:
  --dry-run          Show duplicates without removing them (default)
  --execute          Actually remove the duplicates
  --field=<field>    Check duplicates only on this field
                     Valid fields: plaintext, md5, sha1, sha256, sha512
  --help, -h         Show this help message

Environment Variables:
  REDIS_HOST         Redis host (default: localhost)
  REDIS_PORT         Redis port (default: 6379)
  REDIS_PASSWORD     Redis password (optional)
  REDIS_DB           Redis database number (default: 0)

Examples:
  npx tsx scripts/remove-duplicates.ts                      # Dry run, show all duplicates
  npx tsx scripts/remove-duplicates.ts --execute            # Remove all duplicates
  npx tsx scripts/remove-duplicates.ts --field=md5          # Check only md5 duplicates
  npx tsx scripts/remove-duplicates.ts --execute --field=plaintext

Notes:
  - The script keeps the OLDEST document (by created_at) and removes newer duplicates
  - Always run with --dry-run first to review what will be deleted
  - Duplicates are checked across all hash fields by default
`);
  process.exit(0);
}
|
|
|
|
async function findDuplicatesForField(
|
|
client: Redis,
|
|
field: string
|
|
): Promise<DuplicateGroup[]> {
|
|
const duplicates: DuplicateGroup[] = [];
|
|
|
|
console.log(` Scanning for ${field} duplicates...`);
|
|
|
|
// Get all keys for this field type
|
|
const pattern = field === 'plaintext'
|
|
? 'hash:plaintext:*'
|
|
: `hash:index:${field}:*`;
|
|
|
|
const keys = await client.keys(pattern);
|
|
|
|
// For hash indexes, group by hash value (not plaintext)
|
|
const valueMap = new Map<string, string[]>();
|
|
|
|
if (field === 'plaintext') {
|
|
// Each key is already unique for plaintext
|
|
// Check for same plaintext with different created_at
|
|
for (const key of keys) {
|
|
const plaintext = key.replace('hash:plaintext:', '');
|
|
if (!valueMap.has(plaintext)) {
|
|
valueMap.set(plaintext, []);
|
|
}
|
|
valueMap.get(plaintext)!.push(plaintext);
|
|
}
|
|
} else {
|
|
// For hash fields, get the plaintext and check if multiple plaintexts have same hash
|
|
for (const key of keys) {
|
|
const hashValue = key.replace(`hash:index:${field}:`, '');
|
|
const plaintext = await client.get(key);
|
|
|
|
if (plaintext) {
|
|
if (!valueMap.has(hashValue)) {
|
|
valueMap.set(hashValue, []);
|
|
}
|
|
valueMap.get(hashValue)!.push(plaintext);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Find groups with duplicates
|
|
for (const [value, plaintexts] of valueMap) {
|
|
const uniquePlaintexts = Array.from(new Set(plaintexts));
|
|
|
|
if (uniquePlaintexts.length > 1) {
|
|
// Get documents to compare timestamps
|
|
const docs: { plaintext: string; doc: HashDocument }[] = [];
|
|
|
|
for (const plaintext of uniquePlaintexts) {
|
|
const docKey = `hash:plaintext:${plaintext}`;
|
|
const docData = await client.get(docKey);
|
|
if (docData) {
|
|
docs.push({ plaintext, doc: JSON.parse(docData) });
|
|
}
|
|
}
|
|
|
|
// Sort by created_at (oldest first)
|
|
docs.sort((a, b) =>
|
|
new Date(a.doc.created_at).getTime() - new Date(b.doc.created_at).getTime()
|
|
);
|
|
|
|
if (docs.length > 1) {
|
|
duplicates.push({
|
|
value,
|
|
field,
|
|
plaintexts: docs.map(d => d.plaintext),
|
|
keepPlaintext: docs[0].plaintext,
|
|
deletePlaintexts: docs.slice(1).map(d => d.plaintext)
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
return duplicates;
|
|
}
|
|
|
|
async function removeDuplicates(parsedArgs: ParsedArgs) {
|
|
const client = new Redis({
|
|
host: REDIS_HOST,
|
|
port: REDIS_PORT,
|
|
password: REDIS_PASSWORD,
|
|
db: REDIS_DB,
|
|
});
|
|
|
|
const fields = parsedArgs.field
|
|
? [parsedArgs.field]
|
|
: ['md5', 'sha1', 'sha256', 'sha512'];
|
|
|
|
console.log(`🔍 Hasher Duplicate Remover`);
|
|
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT} (DB ${REDIS_DB})`);
|
|
console.log(`Index: ${INDEX_NAME}`);
|
|
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
|
|
console.log(`Fields to check: ${fields.join(', ')}`);
|
|
console.log('');
|
|
|
|
try {
|
|
// Test connection
|
|
console.log('🔗 Connecting to Redis...');
|
|
await client.ping();
|
|
console.log('✅ Connected successfully\n');
|
|
|
|
// Get index stats
|
|
const stats = await client.hgetall('hash:stats');
|
|
const totalCount = parseInt(stats.count || '0', 10);
|
|
console.log(`📊 Total documents in index: ${totalCount}\n`);
|
|
|
|
const allDuplicates: DuplicateGroup[] = [];
|
|
const seenPlaintexts = new Set<string>();
|
|
|
|
// Find duplicates for each field
|
|
for (const field of fields) {
|
|
console.log(`🔍 Checking duplicates for field: ${field}...`);
|
|
const fieldDuplicates = await findDuplicatesForField(client, field);
|
|
|
|
// Filter out already seen plaintexts
|
|
for (const dup of fieldDuplicates) {
|
|
const newDeletePlaintexts = dup.deletePlaintexts.filter(p => !seenPlaintexts.has(p));
|
|
if (newDeletePlaintexts.length > 0) {
|
|
dup.deletePlaintexts = newDeletePlaintexts;
|
|
newDeletePlaintexts.forEach(p => seenPlaintexts.add(p));
|
|
allDuplicates.push(dup);
|
|
}
|
|
}
|
|
|
|
console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
|
|
}
|
|
|
|
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deletePlaintexts.length, 0);
|
|
|
|
console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
console.log(`📋 Summary:`);
|
|
console.log(` Duplicate groups found: ${allDuplicates.length}`);
|
|
console.log(` Documents to delete: ${totalToDelete}`);
|
|
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
|
|
|
if (allDuplicates.length === 0) {
|
|
console.log('✨ No duplicates found! Index is clean.\n');
|
|
await client.quit();
|
|
return;
|
|
}
|
|
|
|
// Show sample of duplicates
|
|
console.log(`📝 Sample duplicates (showing first 10):\n`);
|
|
const samplesToShow = allDuplicates.slice(0, 10);
|
|
for (const dup of samplesToShow) {
|
|
const truncatedValue = dup.value.length > 50
|
|
? dup.value.substring(0, 50) + '...'
|
|
: dup.value;
|
|
console.log(` Field: ${dup.field}`);
|
|
console.log(` Value: ${truncatedValue}`);
|
|
console.log(` Keep: ${dup.keepPlaintext}`);
|
|
console.log(` Delete: ${dup.deletePlaintexts.length} document(s)`);
|
|
console.log('');
|
|
}
|
|
|
|
if (allDuplicates.length > 10) {
|
|
console.log(` ... and ${allDuplicates.length - 10} more duplicate groups\n`);
|
|
}
|
|
|
|
if (parsedArgs.dryRun) {
|
|
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
console.log(`🔎 DRY RUN - No changes made`);
|
|
console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
|
|
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
|
await client.quit();
|
|
return;
|
|
}
|
|
|
|
// Execute deletion
|
|
console.log(`\n🗑️ Removing ${totalToDelete} duplicate documents...\n`);
|
|
|
|
let deleted = 0;
|
|
let errors = 0;
|
|
|
|
for (const dup of allDuplicates) {
|
|
for (const plaintext of dup.deletePlaintexts) {
|
|
try {
|
|
const docKey = `hash:plaintext:${plaintext}`;
|
|
const docData = await client.get(docKey);
|
|
|
|
if (docData) {
|
|
const doc: HashDocument = JSON.parse(docData);
|
|
const pipeline = client.pipeline();
|
|
|
|
// Delete main document
|
|
pipeline.del(docKey);
|
|
|
|
// Delete all indexes
|
|
pipeline.del(`hash:index:md5:${doc.md5}`);
|
|
pipeline.del(`hash:index:sha1:${doc.sha1}`);
|
|
pipeline.del(`hash:index:sha256:${doc.sha256}`);
|
|
pipeline.del(`hash:index:sha512:${doc.sha512}`);
|
|
|
|
// Update statistics
|
|
pipeline.hincrby('hash:stats', 'count', -1);
|
|
pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);
|
|
|
|
const results = await pipeline.exec();
|
|
|
|
if (results && results.some(([err]) => err !== null)) {
|
|
errors++;
|
|
} else {
|
|
deleted++;
|
|
}
|
|
}
|
|
|
|
process.stdout.write(`\r⏳ Progress: ${deleted + errors}/${totalToDelete} - Deleted: ${deleted}, Errors: ${errors}`);
|
|
} catch (error) {
|
|
console.error(`\n❌ Error deleting ${plaintext}:`, error);
|
|
errors++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Get new count
|
|
const newStats = await client.hgetall('hash:stats');
|
|
const newCount = parseInt(newStats.count || '0', 10);
|
|
|
|
console.log('\n\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
|
|
console.log('✅ Duplicate removal complete!');
|
|
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
console.log(`Documents deleted: ${deleted}`);
|
|
console.log(`Errors: ${errors}`);
|
|
console.log(`Previous document count: ${totalCount}`);
|
|
console.log(`New document count: ${newCount}`);
|
|
console.log('');
|
|
|
|
await client.quit();
|
|
} catch (error) {
|
|
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
|
|
process.exit(1);
|
|
}
|
|
}
|
|
|
|
// Parse command line arguments
|
|
const args = process.argv.slice(2);
|
|
const parsedArgs = parseArgs(args);
|
|
|
|
if (parsedArgs.showHelp) {
|
|
showHelp();
|
|
}
|
|
|
|
// Validate field if provided
|
|
const validFields = ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
|
if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
|
|
console.error(`❌ Invalid field: ${parsedArgs.field}`);
|
|
console.error(` Valid fields: ${validFields.join(', ')}`);
|
|
process.exit(1);
|
|
}
|
|
|
|
console.log(`\n🔧 Configuration:`);
|
|
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
|
|
if (parsedArgs.field) {
|
|
console.log(` Field: ${parsedArgs.field}`);
|
|
} else {
|
|
console.log(` Fields: all (md5, sha1, sha256, sha512)`);
|
|
}
|
|
console.log('');
|
|
|
|
removeDuplicates(parsedArgs).catch(console.error);
|