#!/usr/bin/env node
/**
 * Hasher Indexer Script
 *
 * This script reads a text file with one word/phrase per line and indexes
 * all the generated hashes into Redis.
 *
 * Usage:
 *   npx tsx scripts/index-file.ts <file> [options]
 *   npm run index-file -- <file> [options]
 *
 * Options:
 *   --batch-size=<n>    Number of items to process in each batch (default: 100)
 *   --resume            Resume from last saved state (default: true)
 *   --no-resume         Start from beginning, ignore saved state
 *   --no-check          Skip duplicate checking (faster, but may create duplicates)
 *   --state-file=<path> Custom state file path (default: .indexer-state-<name>.json)
 *   --help, -h          Show this help message
 */

import Redis from 'ioredis';
import {
  createReadStream,
  existsSync,
  readFileSync,
  statSync,
  unlinkSync,
  writeFileSync
} from 'fs';
import { resolve, basename } from 'path';
import { createInterface } from 'readline';
import crypto from 'crypto';

// Redis connection settings, overridable through the environment.
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);

const INDEX_NAME = 'hasher';
const DEFAULT_BATCH_SIZE = 100;

/** One indexed entry: the plaintext plus every hash digest we store for it. */
interface HashDocument {
  plaintext: string;
  md5: string;
  sha1: string;
  sha256: string;
  sha512: string;
  created_at: string;
}

/** Resumable progress snapshot persisted to the state file between runs. */
interface IndexerState {
  filePath: string;
  /** Fingerprint of path+size+mtime; detects that the input file changed. */
  fileHash: string;
  lastProcessedLine: number;
  totalLines: number;
  indexed: number;
  skipped: number;
  errors: number;
  startTime: number;
  lastUpdate: string;
}

/** Result of command-line parsing. */
interface ParsedArgs {
  filePath: string | null;
  batchSize: number;
  resume: boolean;
  checkDuplicates: boolean;
  stateFile: string | null;
  showHelp: boolean;
}

/**
 * Parse CLI arguments.
 *
 * Supports both `--flag=value` and `--flag value` syntax for --batch-size
 * and --state-file. The first non-flag argument is taken as the input file.
 *
 * @param args - argv slice (everything after the script name)
 * @returns parsed options with defaults applied
 */
function parseArgs(args: string[]): ParsedArgs {
  const result: ParsedArgs = {
    filePath: null,
    batchSize: DEFAULT_BATCH_SIZE,
    resume: true,
    checkDuplicates: true,
    stateFile: null,
    showHelp: false
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    if (arg === '--help' || arg === '-h') {
      result.showHelp = true;
    } else if (arg === '--resume') {
      result.resume = true;
    } else if (arg === '--no-resume') {
      result.resume = false;
    } else if (arg === '--no-check') {
      result.checkDuplicates = false;
    } else if (arg.startsWith('--batch-size=')) {
      const value = arg.split('=')[1];
      const parsed = parseInt(value, 10);
      if (!isNaN(parsed) && parsed > 0) {
        result.batchSize = parsed;
      }
    } else if (arg === '--batch-size') {
      // Support "--batch-size <n>" format
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        const parsed = parseInt(nextArg, 10);
        if (!isNaN(parsed) && parsed > 0) {
          result.batchSize = parsed;
          i++; // Skip next argument
        }
      }
    } else if (arg.startsWith('--state-file=')) {
      result.stateFile = arg.split('=')[1];
    } else if (arg === '--state-file') {
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        result.stateFile = nextArg;
        i++;
      }
    } else if (!arg.startsWith('-')) {
      // Positional argument - treat as file path
      result.filePath = arg;
    }
  }

  return result;
}

/**
 * Fingerprint a file cheaply from its path, size, and mtime (not contents),
 * so a resumed run can detect that the input file was replaced or edited.
 *
 * @param filePath - path to the input file (must exist)
 * @returns 8-hex-char identifier
 */
function getFileHash(filePath: string): string {
  // Create a hash based on file path and size for quick identification
  const stats = statSync(filePath);
  const hashInput = `${filePath}:${stats.size}:${stats.mtime.getTime()}`;
  return crypto.createHash('md5').update(hashInput).digest('hex').substring(0, 8);
}

/**
 * Default state-file path: `.indexer-state-<basename-without-extension>.json`
 * resolved against the current working directory.
 */
function getDefaultStateFile(filePath: string): string {
  const fileName = basename(filePath).replace(/\.[^.]+$/, '');
  return resolve(`.indexer-state-${fileName}.json`);
}

/**
 * Load a previously saved indexer state, or null if missing/unreadable.
 * NOTE(review): the JSON is trusted as-is (no schema validation); a corrupt
 * state file with valid JSON would be accepted — confirm acceptable.
 */
function loadState(stateFile: string): IndexerState | null {
  try {
    if (existsSync(stateFile)) {
      const data = readFileSync(stateFile, 'utf-8');
      return JSON.parse(data) as IndexerState;
    }
  } catch (error) {
    console.warn(`⚠️ Could not load state file: ${error}`);
  }
  return null;
}

/**
 * Persist the indexer state to disk, stamping `lastUpdate` first.
 * Failures are logged but never thrown (best-effort checkpointing).
 */
function saveState(stateFile: string, state: IndexerState): void {
  try {
    state.lastUpdate = new Date().toISOString();
    writeFileSync(stateFile, JSON.stringify(state, null, 2), 'utf-8');
  } catch (error) {
    console.error(`❌ Could not save state file: ${error}`);
  }
}
function deleteState(stateFile: string): void { try { if (existsSync(stateFile)) { unlinkSync(stateFile); } } catch (error) { console.warn(`⚠️ Could not delete state file: ${error}`); } } async function generateHashes(plaintext: string): Promise { return { plaintext, md5: crypto.createHash('md5').update(plaintext).digest('hex'), sha1: crypto.createHash('sha1').update(plaintext).digest('hex'), sha256: crypto.createHash('sha256').update(plaintext).digest('hex'), sha512: crypto.createHash('sha512').update(plaintext).digest('hex'), created_at: new Date().toISOString() }; } function showHelp() { console.log(` Hasher Indexer Script Usage: npx tsx scripts/index-file.ts [options] npm run index-file -- [options] Options: --batch-size= Number of items to process in each batch (default: 100) --batch-size Alternative syntax for batch size --resume Resume from last saved state (default) --no-resume Start from beginning, ignore saved state --no-check Skip duplicate checking (faster, but may create duplicates) --state-file= Custom state file path --help, -h Show this help message Environment Variables: REDIS_HOST Redis host (default: localhost) REDIS_PORT Redis port (default: 6379) REDIS_PASSWORD Redis password (optional) REDIS_DB Redis database number (default: 0) Examples: npx tsx scripts/index-file.ts wordlist.txt npx tsx scripts/index-file.ts wordlist.txt --batch-size=500 npx tsx scripts/index-file.ts wordlist.txt --batch-size 500 npx tsx scripts/index-file.ts wordlist.txt --no-resume npx tsx scripts/index-file.ts wordlist.txt --no-check npm run index-file -- wordlist.txt --batch-size=500 --no-check State Management: The script automatically saves progress to a state file. If interrupted, it will resume from where it left off on the next run. Use --no-resume to start fresh. Duplicate Checking: By default, the script checks if each plaintext or hash already exists in the index before inserting. 
Use --no-check to skip this verification for faster indexing (useful when you're sure there are no duplicates). `); process.exit(0); } async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) { const client = new Redis({ host: REDIS_HOST, port: REDIS_PORT, password: REDIS_PASSWORD, db: REDIS_DB, retryStrategy: (times) => Math.min(times * 50, 2000), }); const absolutePath = resolve(filePath); const stateFile = customStateFile || getDefaultStateFile(absolutePath); const fileHash = getFileHash(absolutePath); // State management let state: IndexerState = { filePath: absolutePath, fileHash, lastProcessedLine: 0, totalLines: 0, indexed: 0, skipped: 0, errors: 0, startTime: Date.now(), lastUpdate: new Date().toISOString() }; // Check for existing state const existingState = loadState(stateFile); let resumingFrom = 0; if (shouldResume && existingState) { if (existingState.fileHash === fileHash) { state = existingState; resumingFrom = state.lastProcessedLine; state.startTime = Date.now(); // Reset start time for this session console.log(`📂 Found existing state, resuming from line ${resumingFrom}`); } else { console.log(`⚠️ File has changed since last run, starting fresh`); deleteState(stateFile); } } else if (!shouldResume) { deleteState(stateFile); } console.log(`📚 Hasher Indexer`); console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`); console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT} (DB ${REDIS_DB})`); console.log(`Index: ${INDEX_NAME}`); console.log(`File: ${filePath}`); console.log(`Batch size: ${batchSize}`); console.log(`Check duplicates: ${checkDuplicates ? 
'yes' : 'no (--no-check)'}`); console.log(`State file: ${stateFile}`); if (resumingFrom > 0) { console.log(`Resuming from: line ${resumingFrom}`); console.log(`Already indexed: ${state.indexed}`); console.log(`Already skipped: ${state.skipped}`); } console.log(''); // Handle interruption signals let isInterrupted = false; const handleInterrupt = () => { if (isInterrupted) { console.log('\n\n⚡ Force quit. Progress saved.'); process.exit(1); } isInterrupted = true; console.log('\n\n⏸️ Interrupted! Saving state...'); saveState(stateFile, state); console.log(`💾 State saved to ${stateFile}`); console.log(` Resume with: npx tsx scripts/index-file.ts ${filePath}`); console.log(` Or start fresh with: npx tsx scripts/index-file.ts ${filePath} --no-resume`); process.exit(0); }; process.on('SIGINT', handleInterrupt); process.on('SIGTERM', handleInterrupt); try { // Test connection console.log('🔗 Connecting to Redis...'); await client.ping(); console.log('✅ Connected successfully\n'); // Process file line by line using streams console.log('📖 Processing file...\n'); let currentLineNumber = 0; let currentBatch: string[] = []; let sessionIndexed = 0; let sessionSkipped = 0; let sessionErrors = 0; const sessionStartTime = Date.now(); const fileStream = createReadStream(absolutePath, { encoding: 'utf-8' }); const rl = createInterface({ input: fileStream, crlfDelay: Infinity }); const processBatch = async (batch: string[], lineNumber: number) => { if (batch.length === 0) return; if (isInterrupted) return; // Generate hashes for all items in batch first const batchWithHashes = await Promise.all( batch.map(async (plaintext: string) => ({ plaintext, hashes: await generateHashes(plaintext) })) ); const pipeline = client.pipeline(); let toIndex: typeof batchWithHashes = []; if (checkDuplicates) { // Check which items already exist const existenceChecks = await Promise.all( batchWithHashes.map(async (item) => { const plaintextExists = await 
client.exists(`hash:plaintext:${item.plaintext}`); if (plaintextExists) return { item, exists: true }; // Check if any hash exists const md5Exists = await client.exists(`hash:index:md5:${item.hashes.md5}`); const sha1Exists = await client.exists(`hash:index:sha1:${item.hashes.sha1}`); const sha256Exists = await client.exists(`hash:index:sha256:${item.hashes.sha256}`); const sha512Exists = await client.exists(`hash:index:sha512:${item.hashes.sha512}`); return { item, exists: md5Exists || sha1Exists || sha256Exists || sha512Exists }; }) ); for (const check of existenceChecks) { if (check.exists) { state.skipped++; sessionSkipped++; } else { toIndex.push(check.item); } } } else { // No duplicate checking - index everything toIndex = batchWithHashes; } // Execute bulk operations if (toIndex.length > 0) { try { for (const item of toIndex) { const doc = item.hashes; const key = `hash:plaintext:${doc.plaintext}`; // Store main document pipeline.set(key, JSON.stringify(doc)); // Create indexes for each hash type pipeline.set(`hash:index:md5:${doc.md5}`, doc.plaintext); pipeline.set(`hash:index:sha1:${doc.sha1}`, doc.plaintext); pipeline.set(`hash:index:sha256:${doc.sha256}`, doc.plaintext); pipeline.set(`hash:index:sha512:${doc.sha512}`, doc.plaintext); // Update statistics pipeline.hincrby('hash:stats', 'count', 1); pipeline.hincrby('hash:stats', 'size', JSON.stringify(doc).length); } const results = await pipeline.exec(); // Count errors const errorCount = results?.filter(([err]) => err !== null).length || 0; if (errorCount > 0) { state.errors += errorCount; sessionErrors += errorCount; const successCount = toIndex.length - errorCount; state.indexed += successCount; sessionIndexed += successCount; } else { state.indexed += toIndex.length; sessionIndexed += toIndex.length; } } catch (error) { console.error(`\n❌ Error processing batch:`, error); state.errors += toIndex.length; sessionErrors += toIndex.length; } } // Update state state.lastProcessedLine = lineNumber; 
state.totalLines = lineNumber; // Save state periodically (every 10 batches) if (lineNumber % (batchSize * 10) === 0) { saveState(stateFile, state); } // Progress indicator const elapsed = ((Date.now() - sessionStartTime) / 1000).toFixed(0); process.stdout.write(`\r⏳ Line: ${lineNumber} | Session: +${sessionIndexed} indexed, +${sessionSkipped} skipped | Total: ${state.indexed} indexed | Time: ${elapsed}s`); }; for await (const line of rl) { if (isInterrupted) break; currentLineNumber++; // Skip already processed lines if (currentLineNumber <= resumingFrom) { continue; } const trimmedLine = line.trim(); if (trimmedLine.length > 0) { // Only take first word (no spaces or separators) const firstWord = trimmedLine.split(/\s+/)[0]; if (firstWord) { currentBatch.push(firstWord); if (currentBatch.length >= batchSize) { await processBatch(currentBatch, currentLineNumber); currentBatch = []; } } } } // Process remaining items in last batch if (currentBatch.length > 0 && !isInterrupted) { await processBatch(currentBatch, currentLineNumber); } if (isInterrupted) { return; } // No refresh needed for Redis console.log('\n\n✅ All data persisted to Redis'); // Delete state file on successful completion deleteState(stateFile); const duration = ((Date.now() - sessionStartTime) / 1000).toFixed(2); const rate = sessionIndexed > 0 ? 
(sessionIndexed / parseFloat(duration)).toFixed(0) : '0'; console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'); console.log('✅ Indexing complete!'); console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`); console.log(`Total lines processed: ${currentLineNumber}`); if (resumingFrom > 0) { console.log(`Lines skipped (resumed): ${resumingFrom}`); console.log(`Lines processed this session: ${currentLineNumber - resumingFrom}`); } console.log(`Successfully indexed (total): ${state.indexed}`); console.log(`Successfully indexed (session): ${sessionIndexed}`); console.log(`Skipped duplicates (total): ${state.skipped}`); console.log(`Skipped duplicates (session): ${sessionSkipped}`); console.log(`Errors (total): ${state.errors}`); console.log(`Session duration: ${duration}s`); console.log(`Session rate: ${rate} docs/sec`); console.log(''); } catch (error) { // Save state on error saveState(stateFile, state); console.error(`\n💾 State saved to ${stateFile}`); console.error('❌ Error:', error instanceof Error ? 
error.message : error); process.exit(1); } finally { // Remove signal handlers process.removeListener('SIGINT', handleInterrupt); process.removeListener('SIGTERM', handleInterrupt); } } // Parse command line arguments const args = process.argv.slice(2); const parsedArgs = parseArgs(args); if (parsedArgs.showHelp || !parsedArgs.filePath) { showHelp(); } const filePath = parsedArgs.filePath as string; // Validate file exists if (!existsSync(filePath)) { console.error(`❌ File not found: ${filePath}`); process.exit(1); } console.log(`\n🔧 Configuration:`); console.log(` File: ${filePath}`); console.log(` Batch size: ${parsedArgs.batchSize}`); console.log(` Resume: ${parsedArgs.resume}`); console.log(` Check duplicates: ${parsedArgs.checkDuplicates}`); if (parsedArgs.stateFile) { console.log(` State file: ${parsedArgs.stateFile}`); } console.log(''); indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);