scripts/index-file.ts (normal file, 236 lines)
@@ -0,0 +1,236 @@
#!/usr/bin/env node

/**
 * Hasher Indexer Script
 *
 * This script reads a text file with one word/phrase per line and indexes
 * all the generated hashes into Elasticsearch.
 *
 * Usage:
 *   npm run index-file <path-to-file.txt>
 *   or
 *   node scripts/index-file.js <path-to-file.txt>
 *
 * Options:
 *   --batch-size <number>  Number of items to process in each batch (default: 100)
 *   --help                 Show this help message
 */

import { Client } from '@elastic/elasticsearch';
import { readFileSync } from 'fs';
import { resolve } from 'path';
import crypto from 'crypto';

const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const INDEX_NAME = 'hasher';
const DEFAULT_BATCH_SIZE = 100;

interface HashDocument {
  plaintext: string;
  md5: string;
  sha1: string;
  sha256: string;
  sha512: string;
  bcrypt: string;
  created_at: string;
}

async function generateHashes(plaintext: string): Promise<HashDocument> {
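  // bcrypt is imported dynamically; Node caches the module after the first
  // call, so the import cost is paid only once per process.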
  const bcrypt = await import('bcrypt');
  const bcryptHash = await bcrypt.default.hash(plaintext, 10);

  return {
    plaintext,
    md5: crypto.createHash('md5').update(plaintext).digest('hex'),
    sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
    sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
    sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
    bcrypt: bcryptHash,
    created_at: new Date().toISOString()
  };
}

function showHelp() {
  console.log(`
Hasher Indexer Script

Usage:
  npm run index-file <path-to-file.txt>
  node scripts/index-file.js <path-to-file.txt>

Options:
  --batch-size <number>  Number of items to process in each batch (default: 100)
  --help                 Show this help message

Environment Variables:
  ELASTICSEARCH_NODE     Elasticsearch node URL (default: http://localhost:9200)

Example:
  npm run index-file wordlist.txt
  npm run index-file wordlist.txt -- --batch-size 500
`);
  process.exit(0);
}

async function indexFile(filePath: string, batchSize: number = DEFAULT_BATCH_SIZE) {
  const client = new Client({ node: ELASTICSEARCH_NODE });

  console.log(`📚 Hasher Indexer`);
  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
  console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
  console.log(`Index: ${INDEX_NAME}`);
  console.log(`File: ${filePath}`);
  console.log(`Batch size: ${batchSize}`);
  console.log('');

  try {
    // Test connection
    console.log('🔗 Connecting to Elasticsearch...');
    await client.cluster.health({});
    console.log('✅ Connected successfully\n');

    // Read file
    console.log('📖 Reading file...');
    const absolutePath = resolve(filePath);
    const content = readFileSync(absolutePath, 'utf-8');
    const lines = content.split('\n')
      .map(line => line.trim())
      .filter(line => line.length > 0);

    console.log(`✅ Found ${lines.length} words/phrases to process\n`);

    // Process in batches
    let indexed = 0;
    let skipped = 0;
    let errors = 0;
    const startTime = Date.now();

    for (let i = 0; i < lines.length; i += batchSize) {
      const batch = lines.slice(i, i + batchSize);
      const bulkOperations: any[] = [];

      // Generate hashes for all items in batch first
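      // bcrypt with cost factor 10 dominates the runtime here; with the native
      // bcrypt package the async API runs off the main thread, so the batch
      // hashes concurrently.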
      const batchWithHashes = await Promise.all(
        batch.map(async (plaintext) => ({
          plaintext,
          hashes: await generateHashes(plaintext)
        }))
      );

      // Check which items already exist (by plaintext or any hash)
      const md5List = batchWithHashes.map(item => item.hashes.md5);
      const sha1List = batchWithHashes.map(item => item.hashes.sha1);
      const sha256List = batchWithHashes.map(item => item.hashes.sha256);
      const sha512List = batchWithHashes.map(item => item.hashes.sha512);

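      // bcrypt output is salted, so identical plaintexts never share a bcrypt
      // hash; duplicates are detected via plaintext and the deterministic
      // digests only.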
      const existingCheck = await client.search({
        index: INDEX_NAME,
        size: batchSize * 5, // Account for potential multiple matches
        query: {
          bool: {
            should: [
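              // NOTE: assumes the mapping gives plaintext a .keyword sub-field
              // (Elasticsearch's dynamic-mapping default for string fields)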
              { terms: { 'plaintext.keyword': batch } },
              { terms: { md5: md5List } },
              { terms: { sha1: sha1List } },
              { terms: { sha256: sha256List } },
              { terms: { sha512: sha512List } },
            ],
            minimum_should_match: 1
          }
        },
        _source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
      });

      // Create a set of existing hashes for quick lookup
      const existingHashes = new Set<string>();
      existingCheck.hits.hits.forEach((hit: any) => {
        const src = hit._source;
        existingHashes.add(src.plaintext);
        existingHashes.add(src.md5);
        existingHashes.add(src.sha1);
        existingHashes.add(src.sha256);
        existingHashes.add(src.sha512);
      });
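      // The set mixes plaintexts and hex digests; a new plaintext that happens
      // to equal an existing digest is treated as a duplicate (rare edge case).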

      // Prepare bulk operations only for items that don't have any duplicate hash
      for (const item of batchWithHashes) {
        const isDuplicate =
          existingHashes.has(item.plaintext) ||
          existingHashes.has(item.hashes.md5) ||
          existingHashes.has(item.hashes.sha1) ||
          existingHashes.has(item.hashes.sha256) ||
          existingHashes.has(item.hashes.sha512);

        if (!isDuplicate) {
          bulkOperations.push({ index: { _index: INDEX_NAME } });
          bulkOperations.push(item.hashes);
        } else {
          skipped++;
        }
      }

      // Execute bulk operation only if there are new items to insert
      if (bulkOperations.length > 0) {
        try {
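          // refresh: false defers making the new docs searchable until the
          // explicit indices.refresh after the loop, which speeds up ingestion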
          const bulkResponse = await client.bulk({
            operations: bulkOperations,
            refresh: false
          });

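          // Each document occupies two bulkOperations entries (action line +
          // source line), hence the division by 2 when counting documents.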
          if (bulkResponse.errors) {
            const errorCount = bulkResponse.items.filter((item: any) => item.index?.error).length;
            errors += errorCount;
            indexed += (bulkOperations.length / 2) - errorCount;
          } else {
            indexed += bulkOperations.length / 2;
          }
        } catch (error) {
          console.error(`\n❌ Error processing batch ${i}-${i + batchSize}:`, error);
          errors += bulkOperations.length / 2;
        }
      }

      // Progress indicator
      const progress = Math.min(i + batchSize, lines.length);
      const percent = ((progress / lines.length) * 100).toFixed(1);
      process.stdout.write(`\r⏳ Progress: ${progress}/${lines.length} (${percent}%) - Indexed: ${indexed}, Skipped: ${skipped}, Errors: ${errors}`);
    }

    // Refresh index
    console.log('\n\n🔄 Refreshing index...');
    await client.indices.refresh({ index: INDEX_NAME });

    const duration = ((Date.now() - startTime) / 1000).toFixed(2);
    const rate = (indexed / parseFloat(duration)).toFixed(0);

    console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log('✅ Indexing complete!');
    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`Total processed: ${lines.length}`);
    console.log(`Successfully indexed: ${indexed}`);
    console.log(`Skipped (duplicates): ${skipped}`);
    console.log(`Errors: ${errors}`);
    console.log(`Duration: ${duration}s`);
    console.log(`Rate: ${rate} docs/sec`);
    console.log('');

  } catch (error) {
    console.error('\n❌ Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  }
}

// Parse command line arguments
const args = process.argv.slice(2);

if (args.length === 0 || args.includes('--help') || args.includes('-h')) {
  showHelp();
}

const filePath = args[0];
const batchSizeIndex = args.indexOf('--batch-size');
const parsedBatchSize = batchSizeIndex !== -1 && args[batchSizeIndex + 1]
  ? parseInt(args[batchSizeIndex + 1], 10)
  : DEFAULT_BATCH_SIZE;
// Guard against a non-numeric or non-positive --batch-size value, which would
// otherwise make the batching loop silently process nothing
const batchSize = Number.isFinite(parsedBatchSize) && parsedBatchSize > 0
  ? parsedBatchSize
  : DEFAULT_BATCH_SIZE;

indexFile(filePath, batchSize).catch(console.error);