#!/usr/bin/env node
/**
 * Hasher Indexer Script
 *
 * This script reads a text file with one word/phrase per line and indexes
 * all the generated hashes into Elasticsearch.
 *
 * Usage:
 *   npx tsx scripts/index-file.ts <file> [options]
 *   npm run index-file -- <file> [options]
 *
 * Options:
 *   --batch-size=<n>     Number of items to process in each batch (default: 100)
 *   --resume             Resume from last saved state (default: true)
 *   --no-resume          Start from beginning, ignore saved state
 *   --no-check           Skip duplicate checking (faster, but may create duplicates)
 *   --state-file=<path>  Custom state file path (default: .indexer-state-<file>.json)
 *   --help, -h           Show this help message
 */

import { Client } from '@elastic/elasticsearch';
import {
  createReadStream,
  existsSync,
  readFileSync,
  statSync,
  unlinkSync,
  writeFileSync
} from 'fs';
import { resolve, basename } from 'path';
import { createInterface } from 'readline';
import crypto from 'crypto';

const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const INDEX_NAME = 'hasher';
const DEFAULT_BATCH_SIZE = 100;

interface HashDocument {
  plaintext: string;
  md5: string;
  sha1: string;
  sha256: string;
  sha512: string;
  bcrypt: string;
  created_at: string;
}

interface IndexerState {
  filePath: string;
  fileHash: string;
  lastProcessedLine: number;
  totalLines: number;
  indexed: number;
  skipped: number;
  errors: number;
  startTime: number;
  lastUpdate: string;
}

interface ParsedArgs {
  filePath: string | null;
  batchSize: number;
  resume: boolean;
  checkDuplicates: boolean;
  stateFile: string | null;
  showHelp: boolean;
}

function parseArgs(args: string[]): ParsedArgs {
  const result: ParsedArgs = {
    filePath: null,
    batchSize: DEFAULT_BATCH_SIZE,
    resume: true,
    checkDuplicates: true,
    stateFile: null,
    showHelp: false
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    if (arg === '--help' || arg === '-h') {
      result.showHelp = true;
    } else if (arg === '--resume') {
      result.resume = true;
    } else if (arg === '--no-resume') {
      result.resume = false;
    } else if (arg === '--no-check') {
      result.checkDuplicates = false;
    } else if (arg.startsWith('--batch-size=')) {
      const value = arg.split('=')[1];
      const parsed = parseInt(value, 10);
      if (!isNaN(parsed) && parsed > 0) {
        result.batchSize = parsed;
      }
    } else if (arg === '--batch-size') {
      // Support "--batch-size <n>" format
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        const parsed = parseInt(nextArg, 10);
        if (!isNaN(parsed) && parsed > 0) {
          result.batchSize = parsed;
          i++; // Skip next argument
        }
      }
    } else if (arg.startsWith('--state-file=')) {
      result.stateFile = arg.split('=')[1];
    } else if (arg === '--state-file') {
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        result.stateFile = nextArg;
        i++;
      }
    } else if (!arg.startsWith('-')) {
      // Positional argument - treat as file path
      result.filePath = arg;
    }
  }

  return result;
}

function getFileHash(filePath: string): string {
  // Create a hash based on file path, size, and mtime for quick identification
  const stats = statSync(filePath);
  const hashInput = `${filePath}:${stats.size}:${stats.mtime.getTime()}`;
  return crypto.createHash('md5').update(hashInput).digest('hex').substring(0, 8);
}

function getDefaultStateFile(filePath: string): string {
  const fileName = basename(filePath).replace(/\.[^.]+$/, '');
  return resolve(`.indexer-state-${fileName}.json`);
}
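// For reference, the state file written by saveState() below is plain JSON
// matching the IndexerState interface above. Illustrative contents only
// (hypothetical values):
//
//   {
//     "filePath": "/home/user/wordlist.txt",
//     "fileHash": "3f2a9c1b",
//     "lastProcessedLine": 5000,
//     "totalLines": 5000,
//     "indexed": 4800,
//     "skipped": 200,
//     "errors": 0,
//     "startTime": 1700000000000,
//     "lastUpdate": "2024-01-01T00:00:00.000Z"
//   }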
function loadState(stateFile: string): IndexerState | null {
  try {
    if (existsSync(stateFile)) {
      const data = readFileSync(stateFile, 'utf-8');
      return JSON.parse(data) as IndexerState;
    }
  } catch (error) {
    console.warn(`⚠️ Could not load state file: ${error}`);
  }
  return null;
}

function saveState(stateFile: string, state: IndexerState): void {
  try {
    state.lastUpdate = new Date().toISOString();
    writeFileSync(stateFile, JSON.stringify(state, null, 2), 'utf-8');
  } catch (error) {
    console.error(`❌ Could not save state file: ${error}`);
  }
}

function deleteState(stateFile: string): void {
  try {
    if (existsSync(stateFile)) {
      unlinkSync(stateFile);
    }
  } catch (error) {
    console.warn(`⚠️ Could not delete state file: ${error}`);
  }
}

async function generateHashes(plaintext: string): Promise<HashDocument> {
  const bcrypt = await import('bcrypt');
  const bcryptHash = await bcrypt.default.hash(plaintext, 10);

  return {
    plaintext,
    md5: crypto.createHash('md5').update(plaintext).digest('hex'),
    sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
    sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
    sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
    bcrypt: bcryptHash,
    created_at: new Date().toISOString()
  };
}

function showHelp() {
  console.log(`
Hasher Indexer Script

Usage:
  npx tsx scripts/index-file.ts <file> [options]
  npm run index-file -- <file> [options]

Options:
  --batch-size=<n>     Number of items to process in each batch (default: 100)
  --batch-size <n>     Alternative syntax for batch size
  --resume             Resume from last saved state (default)
  --no-resume          Start from beginning, ignore saved state
  --no-check           Skip duplicate checking (faster, but may create duplicates)
  --state-file=<path>  Custom state file path
  --help, -h           Show this help message

Environment Variables:
  ELASTICSEARCH_NODE   Elasticsearch node URL (default: http://localhost:9200)

Examples:
  npx tsx scripts/index-file.ts wordlist.txt
  npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
  npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
  npx tsx scripts/index-file.ts wordlist.txt --no-resume
  npx tsx scripts/index-file.ts wordlist.txt --no-check
  npm run index-file -- wordlist.txt --batch-size=500 --no-check

State Management:
  The script automatically saves progress to a state file. If interrupted,
  it will resume from where it left off on the next run.
  Use --no-resume to start fresh.

Duplicate Checking:
  By default, the script checks if each plaintext or hash already exists
  in the index before inserting. Use --no-check to skip this verification
  for faster indexing (useful when you're sure there are no duplicates).
`);
  process.exit(0);
}
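// Note: the duplicate check in processBatch() below queries 'plaintext.keyword',
// which assumes the 'hasher' index maps plaintext as text with a keyword
// sub-field and the hash fields as keyword. A minimal mapping sketch - an
// assumption, since the index is created elsewhere; adjust to the real mapping:
//
//   PUT /hasher
//   {
//     "mappings": {
//       "properties": {
//         "plaintext":  { "type": "text",
//                         "fields": { "keyword": { "type": "keyword" } } },
//         "md5":        { "type": "keyword" },
//         "sha1":       { "type": "keyword" },
//         "sha256":     { "type": "keyword" },
//         "sha512":     { "type": "keyword" },
//         "bcrypt":     { "type": "keyword" },
//         "created_at": { "type": "date" }
//       }
//     }
//   }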
async function indexFile(
  filePath: string,
  batchSize: number,
  shouldResume: boolean,
  checkDuplicates: boolean,
  customStateFile: string | null
) {
  const client = new Client({ node: ELASTICSEARCH_NODE });
  const absolutePath = resolve(filePath);
  const stateFile = customStateFile || getDefaultStateFile(absolutePath);
  const fileHash = getFileHash(absolutePath);

  // State management
  let state: IndexerState = {
    filePath: absolutePath,
    fileHash,
    lastProcessedLine: 0,
    totalLines: 0,
    indexed: 0,
    skipped: 0,
    errors: 0,
    startTime: Date.now(),
    lastUpdate: new Date().toISOString()
  };

  // Check for existing state
  const existingState = loadState(stateFile);
  let resumingFrom = 0;

  if (shouldResume && existingState) {
    if (existingState.fileHash === fileHash) {
      state = existingState;
      resumingFrom = state.lastProcessedLine;
      state.startTime = Date.now(); // Reset start time for this session
      console.log(`📂 Found existing state, resuming from line ${resumingFrom}`);
    } else {
      console.log(`⚠️ File has changed since last run, starting fresh`);
      deleteState(stateFile);
    }
  } else if (!shouldResume) {
    deleteState(stateFile);
  }

  console.log(`📚 Hasher Indexer`);
  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
  console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
  console.log(`Index: ${INDEX_NAME}`);
  console.log(`File: ${filePath}`);
  console.log(`Batch size: ${batchSize}`);
  console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
  console.log(`State file: ${stateFile}`);
  if (resumingFrom > 0) {
    console.log(`Resuming from: line ${resumingFrom}`);
    console.log(`Already indexed: ${state.indexed}`);
    console.log(`Already skipped: ${state.skipped}`);
  }
  console.log('');
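  // How resume detection works: getFileHash() hashes "<path>:<size>:<mtime>",
  // so e.g. "/home/user/wordlist.txt:1048576:1700000000000" yields a short
  // fingerprint like "3f2a9c1b" (hypothetical values). Editing or even touching
  // the input file changes the fingerprint, which discards the saved state.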
  // Handle interruption signals: the first Ctrl+C saves state and exits
  // cleanly, a second Ctrl+C force-quits
  let isInterrupted = false;
  const handleInterrupt = () => {
    if (isInterrupted) {
      console.log('\n\n⚡ Force quit. Progress saved.');
      process.exit(1);
    }
    isInterrupted = true;
    console.log('\n\n⏸️ Interrupted! Saving state...');
    saveState(stateFile, state);
    console.log(`💾 State saved to ${stateFile}`);
    console.log(`   Resume with: npx tsx scripts/index-file.ts ${filePath}`);
    console.log(`   Or start fresh with: npx tsx scripts/index-file.ts ${filePath} --no-resume`);
    process.exit(0);
  };

  process.on('SIGINT', handleInterrupt);
  process.on('SIGTERM', handleInterrupt);

  try {
    // Test connection
    console.log('🔗 Connecting to Elasticsearch...');
    await client.cluster.health({});
    console.log('✅ Connected successfully\n');

    // Process file line by line using streams
    console.log('📖 Processing file...\n');

    let currentLineNumber = 0;
    let currentBatch: string[] = [];
    let sessionIndexed = 0;
    let sessionSkipped = 0;
    let sessionErrors = 0;
    const sessionStartTime = Date.now();

    const fileStream = createReadStream(absolutePath, { encoding: 'utf-8' });
    const rl = createInterface({ input: fileStream, crlfDelay: Infinity });

    const processBatch = async (batch: string[], lineNumber: number) => {
      if (batch.length === 0) return;
      if (isInterrupted) return;

      const bulkOperations: any[] = [];

      // Generate hashes for all items in batch first
      const batchWithHashes = await Promise.all(
        batch.map(async (plaintext: string) => ({
          plaintext,
          hashes: await generateHashes(plaintext)
        }))
      );

      if (checkDuplicates) {
        // Check which items already exist (by plaintext or any hash)
        const md5List = batchWithHashes.map((item) => item.hashes.md5);
        const sha1List = batchWithHashes.map((item) => item.hashes.sha1);
        const sha256List = batchWithHashes.map((item) => item.hashes.sha256);
        const sha512List = batchWithHashes.map((item) => item.hashes.sha512);

        const existingCheck = await client.search({
          index: INDEX_NAME,
          size: batchSize * 5,
          query: {
            bool: {
              should: [
                { terms: { 'plaintext.keyword': batch } },
                { terms: { md5: md5List } },
                { terms: { sha1: sha1List } },
                { terms: { sha256: sha256List } },
                { terms: { sha512: sha512List } }
              ],
              minimum_should_match: 1
            }
          },
          _source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
        });

        // Create a set of existing plaintexts and hashes for quick lookup
        const existingHashes = new Set<string>();
        existingCheck.hits.hits.forEach((hit: any) => {
          const src = hit._source;
          existingHashes.add(src.plaintext);
          existingHashes.add(src.md5);
          existingHashes.add(src.sha1);
          existingHashes.add(src.sha256);
          existingHashes.add(src.sha512);
        });

        // Prepare bulk operations only for items that don't have any duplicate hash
        for (const item of batchWithHashes) {
          const isDuplicate =
            existingHashes.has(item.plaintext) ||
            existingHashes.has(item.hashes.md5) ||
            existingHashes.has(item.hashes.sha1) ||
            existingHashes.has(item.hashes.sha256) ||
            existingHashes.has(item.hashes.sha512);

          if (!isDuplicate) {
            bulkOperations.push({ index: { _index: INDEX_NAME } });
            bulkOperations.push(item.hashes);
          } else {
            state.skipped++;
            sessionSkipped++;
          }
        }
      } else {
        // No duplicate checking - index everything
        for (const item of batchWithHashes) {
          bulkOperations.push({ index: { _index: INDEX_NAME } });
          bulkOperations.push(item.hashes);
        }
      }

      // Execute bulk operation only if there are new items to insert
      if (bulkOperations.length > 0) {
        try {
          const bulkResponse = await client.bulk({
            operations: bulkOperations,
            refresh: false
          });

          if (bulkResponse.errors) {
            const errorCount = bulkResponse.items.filter((item: any) => item.index?.error).length;
            state.errors += errorCount;
            sessionErrors += errorCount;
            const successCount = bulkOperations.length / 2 - errorCount;
            state.indexed += successCount;
            sessionIndexed += successCount;
          } else {
            const count = bulkOperations.length / 2;
            state.indexed += count;
            sessionIndexed += count;
          }
        } catch (error) {
          console.error(`\n❌ Error processing batch:`, error);
          const count = bulkOperations.length / 2;
          state.errors += count;
          sessionErrors += count;
        }
      }

      // Update state
      state.lastProcessedLine = lineNumber;
      state.totalLines = lineNumber;

      // Save state periodically (whenever the line count hits a multiple of batchSize * 10)
      if (lineNumber % (batchSize * 10) === 0) {
        saveState(stateFile, state);
      }

      // Progress indicator
      const elapsed = ((Date.now() - sessionStartTime) / 1000).toFixed(0);
      process.stdout.write(
        `\r⏳ Line: ${lineNumber} | Session: +${sessionIndexed} indexed, +${sessionSkipped} skipped | Total: ${state.indexed} indexed | Time: ${elapsed}s`
      );
    };
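    // Shape of the bulk request built in processBatch(): the operations array
    // alternates an action line and a document line, e.g.
    //   { index: { _index: 'hasher' } },
    //   { plaintext: 'hello', md5: '5d41402abc4b2a76b9719d911017c592', ... }
    // which is why bulkOperations.length / 2 equals the number of documents.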
    for await (const line of rl) {
      if (isInterrupted) break;

      currentLineNumber++;

      // Skip already processed lines
      if (currentLineNumber <= resumingFrom) {
        continue;
      }

      const trimmedLine = line.trim();
      if (trimmedLine.length > 0) {
        // Only take the first word (no spaces or separators)
        const firstWord = trimmedLine.split(/\s+/)[0];
        if (firstWord) {
          currentBatch.push(firstWord);

          if (currentBatch.length >= batchSize) {
            await processBatch(currentBatch, currentLineNumber);
            currentBatch = [];
          }
        }
      }
    }

    // Process remaining items in last batch
    if (currentBatch.length > 0 && !isInterrupted) {
      await processBatch(currentBatch, currentLineNumber);
    }

    if (isInterrupted) {
      return;
    }

    // Refresh index
    console.log('\n\n🔄 Refreshing index...');
    await client.indices.refresh({ index: INDEX_NAME });

    // Delete state file on successful completion
    deleteState(stateFile);

    const duration = ((Date.now() - sessionStartTime) / 1000).toFixed(2);
    const rate = sessionIndexed > 0 ? (sessionIndexed / parseFloat(duration)).toFixed(0) : '0';

    console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log('✅ Indexing complete!');
    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`Total lines processed: ${currentLineNumber}`);
    if (resumingFrom > 0) {
      console.log(`Lines skipped (resumed): ${resumingFrom}`);
      console.log(`Lines processed this session: ${currentLineNumber - resumingFrom}`);
    }
    console.log(`Successfully indexed (total): ${state.indexed}`);
    console.log(`Successfully indexed (session): ${sessionIndexed}`);
    console.log(`Skipped duplicates (total): ${state.skipped}`);
    console.log(`Skipped duplicates (session): ${sessionSkipped}`);
    console.log(`Errors (total): ${state.errors}`);
    console.log(`Session duration: ${duration}s`);
    console.log(`Session rate: ${rate} docs/sec`);
    console.log('');
  } catch (error) {
    // Save state on error
    saveState(stateFile, state);
    console.error(`\n💾 State saved to ${stateFile}`);
    console.error('❌ Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  } finally {
    // Remove signal handlers
    process.removeListener('SIGINT', handleInterrupt);
    process.removeListener('SIGTERM', handleInterrupt);
  }
}
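// Typical invocations (these mirror the examples in showHelp() above):
//   npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
//   npx tsx scripts/index-file.ts wordlist.txt               (resumes if a state file exists)
//   npx tsx scripts/index-file.ts wordlist.txt --no-resume   (start fresh)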
// Parse command line arguments
const args = process.argv.slice(2);
const parsedArgs = parseArgs(args);

if (parsedArgs.showHelp || !parsedArgs.filePath) {
  showHelp();
}

const filePath = parsedArgs.filePath as string;

// Validate that the file exists
if (!existsSync(filePath)) {
  console.error(`❌ File not found: ${filePath}`);
  process.exit(1);
}

console.log(`\n🔧 Configuration:`);
console.log(`   File: ${filePath}`);
console.log(`   Batch size: ${parsedArgs.batchSize}`);
console.log(`   Resume: ${parsedArgs.resume}`);
console.log(`   Check duplicates: ${parsedArgs.checkDuplicates}`);
if (parsedArgs.stateFile) {
  console.log(`   State file: ${parsedArgs.stateFile}`);
}
console.log('');

indexFile(
  filePath,
  parsedArgs.batchSize,
  parsedArgs.resume,
  parsedArgs.checkDuplicates,
  parsedArgs.stateFile
).catch(console.error);
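// The "npm run index-file" form in the usage text assumes a package.json
// script roughly like the following (an assumption - match it to the repo's
// actual setup):
//
//   "scripts": {
//     "index-file": "tsx scripts/index-file.ts"
//   }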