hasher/scripts/index-file.ts
#!/usr/bin/env node
/**
 * Hasher Indexer Script
 *
 * This script reads a text file with one word/phrase per line and indexes
 * all the generated hashes into Elasticsearch.
 *
 * Usage:
 *   npx tsx scripts/index-file.ts <path-to-file.txt> [options]
 *   npm run index-file -- <path-to-file.txt> [options]
 *
 * Options:
 *   --batch-size=<number>  Number of items to process in each batch (default: 100)
 *   --resume               Resume from last saved state (default: true)
 *   --no-resume            Start from beginning, ignore saved state
 *   --no-check             Skip duplicate checking (faster, but may create duplicates)
 *   --state-file=<path>    Custom state file path (default: .indexer-state-<filename>.json)
 *   --help, -h             Show this help message
 */
import { Client } from '@elastic/elasticsearch';
import { createReadStream, existsSync, readFileSync, statSync, unlinkSync, writeFileSync } from 'fs';
import { resolve, basename } from 'path';
import { createInterface } from 'readline';
import crypto from 'crypto';
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const INDEX_NAME = 'hasher';
const DEFAULT_BATCH_SIZE = 100;
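// NOTE (assumption): the duplicate check below runs `terms` queries against
// `plaintext.keyword`, `md5`, `sha1`, `sha256`, and `sha512`, which only works
// if those fields are keyword-typed. The actual index mapping is defined
// elsewhere in the repo; a minimal sketch compatible with these queries:
//
//   PUT /hasher
//   { "mappings": { "properties": {
//       "plaintext": { "type": "text", "fields": { "keyword": { "type": "keyword" } } },
//       "md5": { "type": "keyword" }, "sha1": { "type": "keyword" },
//       "sha256": { "type": "keyword" }, "sha512": { "type": "keyword" },
//       "bcrypt": { "type": "keyword" }, "created_at": { "type": "date" } } } }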
interface HashDocument {
  plaintext: string;
  md5: string;
  sha1: string;
  sha256: string;
  sha512: string;
  bcrypt: string;
  created_at: string;
}
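// Example of the document shape produced for one input word (hash values
// abbreviated here for illustration):
//
//   {
//     "plaintext": "password",
//     "md5": "5f4dcc3b5aa765d6...",
//     "sha1": "5baa61e4c9b93f3f...",
//     "sha256": "5e884898da280471...",
//     "sha512": "b109f3bbbc244eb8...",
//     "bcrypt": "$2b$10$...",
//     "created_at": "2025-01-01T00:00:00.000Z"
//   }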
interface IndexerState {
  filePath: string;
  fileHash: string;
  lastProcessedLine: number;
  totalLines: number;
  indexed: number;
  skipped: number;
  errors: number;
  startTime: number;
  lastUpdate: string;
}
interface ParsedArgs {
  filePath: string | null;
  batchSize: number;
  resume: boolean;
  checkDuplicates: boolean;
  stateFile: string | null;
  showHelp: boolean;
}
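// Example: ['wordlist.txt', '--batch-size=500', '--no-check'] parses to
//   { filePath: 'wordlist.txt', batchSize: 500, resume: true,
//     checkDuplicates: false, stateFile: null, showHelp: false }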
function parseArgs(args: string[]): ParsedArgs {
  const result: ParsedArgs = {
    filePath: null,
    batchSize: DEFAULT_BATCH_SIZE,
    resume: true,
    checkDuplicates: true,
    stateFile: null,
    showHelp: false
  };
  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '--help' || arg === '-h') {
      result.showHelp = true;
    } else if (arg === '--resume') {
      result.resume = true;
    } else if (arg === '--no-resume') {
      result.resume = false;
    } else if (arg === '--no-check') {
      result.checkDuplicates = false;
    } else if (arg.startsWith('--batch-size=')) {
      const value = arg.split('=')[1];
      const parsed = parseInt(value, 10);
      if (!isNaN(parsed) && parsed > 0) {
        result.batchSize = parsed;
      }
    } else if (arg === '--batch-size') {
      // Support the `--batch-size <value>` format
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        const parsed = parseInt(nextArg, 10);
        if (!isNaN(parsed) && parsed > 0) {
          result.batchSize = parsed;
          i++; // Skip the consumed value argument
        }
      }
    } else if (arg.startsWith('--state-file=')) {
      // Slice by prefix length rather than split('=') so paths that
      // themselves contain '=' survive intact
      result.stateFile = arg.slice('--state-file='.length);
    } else if (arg === '--state-file') {
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        result.stateFile = nextArg;
        i++;
      }
    } else if (!arg.startsWith('-')) {
      // Positional argument - treat it as the input file path
      result.filePath = arg;
    }
  }
  return result;
}
function getFileHash(filePath: string): string {
  // Derive a quick fingerprint from the file's path, size, and mtime
  const stats = statSync(filePath);
  const hashInput = `${filePath}:${stats.size}:${stats.mtime.getTime()}`;
  return crypto.createHash('md5').update(hashInput).digest('hex').substring(0, 8);
}
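// For example, an input file named `wordlist.txt` gets the state file
// `.indexer-state-wordlist.json` in the current working directory.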
function getDefaultStateFile(filePath: string): string {
  const fileName = basename(filePath).replace(/\.[^.]+$/, '');
  return resolve(`.indexer-state-${fileName}.json`);
}
function loadState(stateFile: string): IndexerState | null {
  try {
    if (existsSync(stateFile)) {
      const data = readFileSync(stateFile, 'utf-8');
      return JSON.parse(data) as IndexerState;
    }
  } catch (error) {
    console.warn(`⚠️ Could not load state file: ${error}`);
  }
  return null;
}
function saveState(stateFile: string, state: IndexerState): void {
  try {
    state.lastUpdate = new Date().toISOString();
    writeFileSync(stateFile, JSON.stringify(state, null, 2), 'utf-8');
  } catch (error) {
    console.error(`❌ Could not save state file: ${error}`);
  }
}
function deleteState(stateFile: string): void {
  try {
    if (existsSync(stateFile)) {
      unlinkSync(stateFile);
    }
  } catch (error) {
    console.warn(`⚠️ Could not delete state file: ${error}`);
  }
}
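// Note: bcrypt.hash with a cost factor of 10 is deliberately slow (on the
// order of tens of milliseconds per hash), so bcrypt dominates batch
// throughput; the MD5/SHA digests are comparatively free. The dynamic
// import below is cached by Node after the first call, so repeating it
// per invocation costs little.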
async function generateHashes(plaintext: string): Promise<HashDocument> {
  const bcrypt = await import('bcrypt');
  const bcryptHash = await bcrypt.default.hash(plaintext, 10);
  return {
    plaintext,
    md5: crypto.createHash('md5').update(plaintext).digest('hex'),
    sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
    sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
    sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
    bcrypt: bcryptHash,
    created_at: new Date().toISOString()
  };
}
function showHelp() {
  console.log(`
Hasher Indexer Script
Usage:
  npx tsx scripts/index-file.ts <path-to-file.txt> [options]
  npm run index-file -- <path-to-file.txt> [options]
Options:
  --batch-size=<number>  Number of items to process in each batch (default: 100)
  --batch-size <number>  Alternative syntax for batch size
  --resume               Resume from last saved state (default)
  --no-resume            Start from beginning, ignore saved state
  --no-check             Skip duplicate checking (faster, but may create duplicates)
  --state-file=<path>    Custom state file path
  --help, -h             Show this help message
Environment Variables:
  ELASTICSEARCH_NODE     Elasticsearch node URL (default: http://localhost:9200)
Examples:
  npx tsx scripts/index-file.ts wordlist.txt
  npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
  npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
  npx tsx scripts/index-file.ts wordlist.txt --no-resume
  npx tsx scripts/index-file.ts wordlist.txt --no-check
  npm run index-file -- wordlist.txt --batch-size=500 --no-check
State Management:
  The script automatically saves progress to a state file. If interrupted,
  it will resume from where it left off on the next run. Use --no-resume
  to start fresh.
Duplicate Checking:
  By default, the script checks if each plaintext or hash already exists
  in the index before inserting. Use --no-check to skip this verification
  for faster indexing (useful when you're sure there are no duplicates).
`);
  process.exit(0);
}
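/**
 * Main indexing flow: connect to Elasticsearch, stream the input file line
 * by line, accumulate the first word of each non-empty line into batches,
 * optionally filter out duplicates with a search query, bulk-index the rest,
 * and checkpoint progress to the state file so interrupted runs can resume.
 */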
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
  const client = new Client({ node: ELASTICSEARCH_NODE });
  const absolutePath = resolve(filePath);
  const stateFile = customStateFile || getDefaultStateFile(absolutePath);
  const fileHash = getFileHash(absolutePath);
  // State management
  let state: IndexerState = {
    filePath: absolutePath,
    fileHash,
    lastProcessedLine: 0,
    totalLines: 0,
    indexed: 0,
    skipped: 0,
    errors: 0,
    startTime: Date.now(),
    lastUpdate: new Date().toISOString()
  };
  // Check for existing state
  const existingState = loadState(stateFile);
  let resumingFrom = 0;
  if (shouldResume && existingState) {
    if (existingState.fileHash === fileHash) {
      state = existingState;
      resumingFrom = state.lastProcessedLine;
      state.startTime = Date.now(); // Reset start time for this session
      console.log(`📂 Found existing state, resuming from line ${resumingFrom}`);
    } else {
      console.log(`⚠️ File has changed since last run, starting fresh`);
      deleteState(stateFile);
    }
  } else if (!shouldResume) {
    deleteState(stateFile);
  }
  console.log(`📚 Hasher Indexer`);
  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
  console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
  console.log(`Index: ${INDEX_NAME}`);
  console.log(`File: ${filePath}`);
  console.log(`Batch size: ${batchSize}`);
  console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
  console.log(`State file: ${stateFile}`);
  if (resumingFrom > 0) {
    console.log(`Resuming from: line ${resumingFrom}`);
    console.log(`Already indexed: ${state.indexed}`);
    console.log(`Already skipped: ${state.skipped}`);
  }
  console.log('');
  // Handle interruption signals
  let isInterrupted = false;
  const handleInterrupt = () => {
    if (isInterrupted) {
      console.log('\n\n⚡ Force quit. Progress saved.');
      process.exit(1);
    }
    isInterrupted = true;
    console.log('\n\n⏸ Interrupted! Saving state...');
    saveState(stateFile, state);
    console.log(`💾 State saved to ${stateFile}`);
    console.log(`   Resume with: npx tsx scripts/index-file.ts ${filePath}`);
    console.log(`   Or start fresh with: npx tsx scripts/index-file.ts ${filePath} --no-resume`);
    process.exit(0);
  };
  process.on('SIGINT', handleInterrupt);
  process.on('SIGTERM', handleInterrupt);
  try {
    // Test connection
    console.log('🔗 Connecting to Elasticsearch...');
    await client.cluster.health({});
    console.log('✅ Connected successfully\n');
    // Process file line by line using streams
    console.log('📖 Processing file...\n');
    let currentLineNumber = 0;
    let currentBatch: string[] = [];
    let batchesProcessed = 0;
    let sessionIndexed = 0;
    let sessionSkipped = 0;
    let sessionErrors = 0;
    const sessionStartTime = Date.now();
    const fileStream = createReadStream(absolutePath, { encoding: 'utf-8' });
    const rl = createInterface({
      input: fileStream,
      crlfDelay: Infinity
    });
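    // The crlfDelay: Infinity option above makes readline treat \r\n as a
    // single line break, so Windows-style files don't yield phantom lines.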
    const processBatch = async (batch: string[], lineNumber: number) => {
      if (batch.length === 0) return;
      if (isInterrupted) return;
      const bulkOperations: any[] = [];
      // Generate hashes for all items in the batch first
      const batchWithHashes = await Promise.all(
        batch.map(async (plaintext: string) => ({
          plaintext,
          hashes: await generateHashes(plaintext)
        }))
      );
      if (checkDuplicates) {
        // Check which items already exist (by plaintext or any hash)
        const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
        const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
        const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256);
        const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512);
        const existingCheck = await client.search({
          index: INDEX_NAME,
          size: batchSize * 5,
          query: {
            bool: {
              should: [
                { terms: { 'plaintext.keyword': batch } },
                { terms: { md5: md5List } },
                { terms: { sha1: sha1List } },
                { terms: { sha256: sha256List } },
                { terms: { sha512: sha512List } },
              ],
              minimum_should_match: 1
            }
          },
          _source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
        });
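        // `size: batchSize * 5` caps the hits returned (five query fields per
        // item). If a batch matched more existing documents than that, the
        // overflow would not be returned and a duplicate could slip through;
        // the cap trades completeness for query cost.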
        // Create a set of existing values (plaintexts and hashes) for quick lookup
        const existingHashes = new Set<string>();
        existingCheck.hits.hits.forEach((hit: any) => {
          const src = hit._source;
          existingHashes.add(src.plaintext);
          existingHashes.add(src.md5);
          existingHashes.add(src.sha1);
          existingHashes.add(src.sha256);
          existingHashes.add(src.sha512);
        });
        // Prepare bulk operations only for items without any duplicate hash
        for (const item of batchWithHashes) {
          const isDuplicate =
            existingHashes.has(item.plaintext) ||
            existingHashes.has(item.hashes.md5) ||
            existingHashes.has(item.hashes.sha1) ||
            existingHashes.has(item.hashes.sha256) ||
            existingHashes.has(item.hashes.sha512);
          if (!isDuplicate) {
            bulkOperations.push({ index: { _index: INDEX_NAME } });
            bulkOperations.push(item.hashes);
          } else {
            state.skipped++;
            sessionSkipped++;
          }
        }
      } else {
        // No duplicate checking - index everything
        for (const item of batchWithHashes) {
          bulkOperations.push({ index: { _index: INDEX_NAME } });
          bulkOperations.push(item.hashes);
        }
      }
      // Execute the bulk operation only if there are new items to insert
      if (bulkOperations.length > 0) {
        try {
          const bulkResponse = await client.bulk({
            operations: bulkOperations,
            refresh: false
          });
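          // refresh: false defers making the new documents searchable until
          // the explicit indices.refresh() at the end of the run, which keeps
          // bulk throughput high during indexing.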
          if (bulkResponse.errors) {
            const errorCount = bulkResponse.items.filter((item: any) => item.index?.error).length;
            state.errors += errorCount;
            sessionErrors += errorCount;
            const successCount = (bulkOperations.length / 2) - errorCount;
            state.indexed += successCount;
            sessionIndexed += successCount;
          } else {
            const count = bulkOperations.length / 2;
            state.indexed += count;
            sessionIndexed += count;
          }
        } catch (error) {
          console.error(`\n❌ Error processing batch:`, error);
          const count = bulkOperations.length / 2;
          state.errors += count;
          sessionErrors += count;
        }
      }
      // Update state
      state.lastProcessedLine = lineNumber;
      state.totalLines = lineNumber;
      // Save state periodically (every 10 batches). A batch counter is used
      // because `lineNumber % (batchSize * 10) === 0` only fires when the line
      // count happens to align exactly, which blank or skipped lines prevent.
      batchesProcessed++;
      if (batchesProcessed % 10 === 0) {
        saveState(stateFile, state);
      }
      // Progress indicator
      const elapsed = ((Date.now() - sessionStartTime) / 1000).toFixed(0);
      process.stdout.write(`\r⏳ Line: ${lineNumber} | Session: +${sessionIndexed} indexed, +${sessionSkipped} skipped | Total: ${state.indexed} indexed | Time: ${elapsed}s`);
    };
    for await (const line of rl) {
      if (isInterrupted) break;
      currentLineNumber++;
      // Skip lines already processed in a previous run
      if (currentLineNumber <= resumingFrom) {
        continue;
      }
      const trimmedLine = line.trim();
      if (trimmedLine.length > 0) {
        // Only take the first word (no spaces or separators)
        const firstWord = trimmedLine.split(/\s+/)[0];
        if (firstWord) {
          currentBatch.push(firstWord);
          if (currentBatch.length >= batchSize) {
            await processBatch(currentBatch, currentLineNumber);
            currentBatch = [];
          }
        }
      }
    }
    // Process any remaining items in the last batch
    if (currentBatch.length > 0 && !isInterrupted) {
      await processBatch(currentBatch, currentLineNumber);
    }
    if (isInterrupted) {
      return;
    }
    // Refresh the index so the new documents become searchable
    console.log('\n\n🔄 Refreshing index...');
    await client.indices.refresh({ index: INDEX_NAME });
    // Delete the state file on successful completion
    deleteState(stateFile);
    const duration = ((Date.now() - sessionStartTime) / 1000).toFixed(2);
    const rate = sessionIndexed > 0 ? (sessionIndexed / parseFloat(duration)).toFixed(0) : '0';
    console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log('✅ Indexing complete!');
    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`Total lines processed: ${currentLineNumber}`);
    if (resumingFrom > 0) {
      console.log(`Lines skipped (resumed): ${resumingFrom}`);
      console.log(`Lines processed this session: ${currentLineNumber - resumingFrom}`);
    }
    console.log(`Successfully indexed (total): ${state.indexed}`);
    console.log(`Successfully indexed (session): ${sessionIndexed}`);
    console.log(`Skipped duplicates (total): ${state.skipped}`);
    console.log(`Skipped duplicates (session): ${sessionSkipped}`);
    console.log(`Errors (total): ${state.errors}`);
    console.log(`Session duration: ${duration}s`);
    console.log(`Session rate: ${rate} docs/sec`);
    console.log('');
  } catch (error) {
    // Save state on error so the run can be resumed
    saveState(stateFile, state);
    console.error(`\n💾 State saved to ${stateFile}`);
    console.error('❌ Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  } finally {
    // Remove signal handlers
    process.removeListener('SIGINT', handleInterrupt);
    process.removeListener('SIGTERM', handleInterrupt);
  }
}
// Parse command line arguments
const args = process.argv.slice(2);
const parsedArgs = parseArgs(args);
if (parsedArgs.showHelp || !parsedArgs.filePath) {
  showHelp();
}
const filePath = parsedArgs.filePath as string;
// Validate that the file exists
if (!existsSync(filePath)) {
  console.error(`❌ File not found: ${filePath}`);
  process.exit(1);
}
console.log(`\n🔧 Configuration:`);
console.log(`   File: ${filePath}`);
console.log(`   Batch size: ${parsedArgs.batchSize}`);
console.log(`   Resume: ${parsedArgs.resume}`);
console.log(`   Check duplicates: ${parsedArgs.checkDuplicates}`);
if (parsedArgs.stateFile) {
  console.log(`   State file: ${parsedArgs.stateFile}`);
}
console.log('');
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);