hasher/scripts/index-file.ts
#!/usr/bin/env node
/**
 * Hasher Indexer Script
 *
 * This script reads a text file with one word/phrase per line and indexes
 * all the generated hashes into Elasticsearch.
 *
 * Usage:
 *   npx tsx scripts/index-file.ts <path-to-file.txt> [options]
 *   npm run index-file -- <path-to-file.txt> [options]
 *
 * Options:
 *   --batch-size=<number>  Number of items to process in each batch (default: 100)
 *   --resume               Resume from last saved state (default: true)
 *   --no-resume            Start from beginning, ignore saved state
 *   --no-check             Skip duplicate checking (faster, but may create duplicates)
 *   --state-file=<path>    Custom state file path (default: .indexer-state-<filename>.json)
 *   --help, -h             Show this help message
 */
import { Client } from '@elastic/elasticsearch';
import { createReadStream, existsSync, readFileSync, statSync, unlinkSync, writeFileSync } from 'fs';
import { resolve, basename } from 'path';
import { createInterface } from 'readline';
import crypto from 'crypto';
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const INDEX_NAME = 'hasher';
const DEFAULT_BATCH_SIZE = 100;
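// NOTE (assumption): the duplicate check below runs `terms` queries against
// `plaintext.keyword`, `md5`, `sha1`, `sha256`, and `sha512`, which only works
// if those fields are keyword-typed. The actual index mapping is defined
// elsewhere in the repo; a minimal sketch compatible with these queries:
//
//   PUT /hasher
//   { "mappings": { "properties": {
//       "plaintext": { "type": "text", "fields": { "keyword": { "type": "keyword" } } },
//       "md5": { "type": "keyword" }, "sha1": { "type": "keyword" },
//       "sha256": { "type": "keyword" }, "sha512": { "type": "keyword" },
//       "bcrypt": { "type": "keyword" }, "created_at": { "type": "date" } } } }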
interface HashDocument {
  plaintext: string;
  md5: string;
  sha1: string;
  sha256: string;
  sha512: string;
  bcrypt: string;
  created_at: string;
}
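// Example of the document shape produced for one input word (hash values
// abbreviated here for illustration):
//
//   {
//     "plaintext": "password",
//     "md5": "5f4dcc3b5aa765d6...",
//     "sha1": "5baa61e4c9b93f3f...",
//     "sha256": "5e884898da280471...",
//     "sha512": "b109f3bbbc244eb8...",
//     "bcrypt": "$2b$10$...",
//     "created_at": "2025-01-01T00:00:00.000Z"
//   }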
interface IndexerState {
  filePath: string;
  fileHash: string;
  lastProcessedLine: number;
  totalLines: number;
  indexed: number;
  skipped: number;
  errors: number;
  startTime: number;
  lastUpdate: string;
}
interface ParsedArgs {
  filePath: string | null;
  batchSize: number;
  resume: boolean;
  checkDuplicates: boolean;
  stateFile: string | null;
  showHelp: boolean;
}
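// Example: ['wordlist.txt', '--batch-size=500', '--no-check'] parses to
//   { filePath: 'wordlist.txt', batchSize: 500, resume: true,
//     checkDuplicates: false, stateFile: null, showHelp: false }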
function parseArgs(args: string[]): ParsedArgs {
  const result: ParsedArgs = {
    filePath: null,
    batchSize: DEFAULT_BATCH_SIZE,
    resume: true,
    checkDuplicates: true,
    stateFile: null,
    showHelp: false
  };
  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '--help' || arg === '-h') {
      result.showHelp = true;
    } else if (arg === '--resume') {
      result.resume = true;
    } else if (arg === '--no-resume') {
      result.resume = false;
    } else if (arg === '--no-check') {
      result.checkDuplicates = false;
    } else if (arg.startsWith('--batch-size=')) {
      const value = arg.split('=')[1];
      const parsed = parseInt(value, 10);
      if (!isNaN(parsed) && parsed > 0) {
        result.batchSize = parsed;
      }
    } else if (arg === '--batch-size') {
      // Support the `--batch-size <value>` format
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        const parsed = parseInt(nextArg, 10);
        if (!isNaN(parsed) && parsed > 0) {
          result.batchSize = parsed;
          i++; // Skip the consumed value argument
        }
      }
    } else if (arg.startsWith('--state-file=')) {
      // Slice by prefix length rather than split('=') so paths that
      // themselves contain '=' survive intact
      result.stateFile = arg.slice('--state-file='.length);
    } else if (arg === '--state-file') {
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        result.stateFile = nextArg;
        i++;
      }
    } else if (!arg.startsWith('-')) {
      // Positional argument - treat it as the input file path
      result.filePath = arg;
    }
  }
  return result;
}
function getFileHash(filePath: string): string {
  // Derive a quick fingerprint from the file's path, size, and mtime
  const stats = statSync(filePath);
  const hashInput = `${filePath}:${stats.size}:${stats.mtime.getTime()}`;
  return crypto.createHash('md5').update(hashInput).digest('hex').substring(0, 8);
}
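// For example, an input file named `wordlist.txt` gets the state file
// `.indexer-state-wordlist.json` in the current working directory.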
function getDefaultStateFile(filePath: string): string {
  const fileName = basename(filePath).replace(/\.[^.]+$/, '');
  return resolve(`.indexer-state-${fileName}.json`);
}
function loadState(stateFile: string): IndexerState | null {
  try {
    if (existsSync(stateFile)) {
      const data = readFileSync(stateFile, 'utf-8');
      return JSON.parse(data) as IndexerState;
    }
  } catch (error) {
    console.warn(`⚠️ Could not load state file: ${error}`);
  }
  return null;
}
function saveState(stateFile: string, state: IndexerState): void {
  try {
    state.lastUpdate = new Date().toISOString();
    writeFileSync(stateFile, JSON.stringify(state, null, 2), 'utf-8');
  } catch (error) {
    console.error(`❌ Could not save state file: ${error}`);
  }
}
function deleteState(stateFile: string): void {
  try {
    if (existsSync(stateFile)) {
      unlinkSync(stateFile);
    }
  } catch (error) {
    console.warn(`⚠️ Could not delete state file: ${error}`);
  }
}
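// Note: bcrypt.hash with a cost factor of 10 is deliberately slow (on the
// order of tens of milliseconds per hash), so bcrypt dominates batch
// throughput; the MD5/SHA digests are comparatively free. The dynamic
// import below is cached by Node after the first call, so repeating it
// per invocation costs little.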
async function generateHashes(plaintext: string): Promise<HashDocument> {
  const bcrypt = await import('bcrypt');
  const bcryptHash = await bcrypt.default.hash(plaintext, 10);
  return {
    plaintext,
    md5: crypto.createHash('md5').update(plaintext).digest('hex'),
    sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
    sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
    sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
    bcrypt: bcryptHash,
    created_at: new Date().toISOString()
  };
}
function showHelp() {
  console.log(`
Hasher Indexer Script
Usage:
  npx tsx scripts/index-file.ts <path-to-file.txt> [options]
  npm run index-file -- <path-to-file.txt> [options]
Options:
  --batch-size=<number>  Number of items to process in each batch (default: 100)
  --batch-size <number>  Alternative syntax for batch size
  --resume               Resume from last saved state (default)
  --no-resume            Start from beginning, ignore saved state
  --no-check             Skip duplicate checking (faster, but may create duplicates)
  --state-file=<path>    Custom state file path
  --help, -h             Show this help message
Environment Variables:
  ELASTICSEARCH_NODE     Elasticsearch node URL (default: http://localhost:9200)
Examples:
  npx tsx scripts/index-file.ts wordlist.txt
  npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
  npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
  npx tsx scripts/index-file.ts wordlist.txt --no-resume
  npx tsx scripts/index-file.ts wordlist.txt --no-check
  npm run index-file -- wordlist.txt --batch-size=500 --no-check
State Management:
  The script automatically saves progress to a state file. If interrupted,
  it will resume from where it left off on the next run. Use --no-resume
  to start fresh.
Duplicate Checking:
  By default, the script checks if each plaintext or hash already exists
  in the index before inserting. Use --no-check to skip this verification
  for faster indexing (useful when you're sure there are no duplicates).
`);
  process.exit(0);
}
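/**
 * Main indexing flow: connect to Elasticsearch, stream the input file line
 * by line, accumulate the first word of each non-empty line into batches,
 * optionally filter out duplicates with a search query, bulk-index the rest,
 * and checkpoint progress to the state file so interrupted runs can resume.
 */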
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
  const client = new Client({ node: ELASTICSEARCH_NODE });
  const absolutePath = resolve(filePath);
  const stateFile = customStateFile || getDefaultStateFile(absolutePath);
  const fileHash = getFileHash(absolutePath);
  // State management
  let state: IndexerState = {
    filePath: absolutePath,
    fileHash,
    lastProcessedLine: 0,
    totalLines: 0,
    indexed: 0,
    skipped: 0,
    errors: 0,
    startTime: Date.now(),
    lastUpdate: new Date().toISOString()
  };
  // Check for existing state
  const existingState = loadState(stateFile);
  let resumingFrom = 0;
  if (shouldResume && existingState) {
    if (existingState.fileHash === fileHash) {
      state = existingState;
      resumingFrom = state.lastProcessedLine;
      state.startTime = Date.now(); // Reset start time for this session
      console.log(`📂 Found existing state, resuming from line ${resumingFrom}`);
    } else {
      console.log(`⚠️ File has changed since last run, starting fresh`);
      deleteState(stateFile);
    }
  } else if (!shouldResume) {
    deleteState(stateFile);
  }
  console.log(`📚 Hasher Indexer`);
  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
  console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
  console.log(`Index: ${INDEX_NAME}`);
  console.log(`File: ${filePath}`);
  console.log(`Batch size: ${batchSize}`);
  console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
  console.log(`State file: ${stateFile}`);
  if (resumingFrom > 0) {
    console.log(`Resuming from: line ${resumingFrom}`);
    console.log(`Already indexed: ${state.indexed}`);
    console.log(`Already skipped: ${state.skipped}`);
  }
  console.log('');
  // Handle interruption signals
  let isInterrupted = false;
  const handleInterrupt = () => {
    if (isInterrupted) {
      console.log('\n\n⚡ Force quit. Progress saved.');
      process.exit(1);
    }
    isInterrupted = true;
    console.log('\n\n⏸ Interrupted! Saving state...');
    saveState(stateFile, state);
    console.log(`💾 State saved to ${stateFile}`);
    console.log(`   Resume with: npx tsx scripts/index-file.ts ${filePath}`);
    console.log(`   Or start fresh with: npx tsx scripts/index-file.ts ${filePath} --no-resume`);
    process.exit(0);
  };
  process.on('SIGINT', handleInterrupt);
  process.on('SIGTERM', handleInterrupt);
  try {
    // Test connection
    console.log('🔗 Connecting to Elasticsearch...');
    await client.cluster.health({});
    console.log('✅ Connected successfully\n');
    // Process file line by line using streams
    console.log('📖 Processing file...\n');
    let currentLineNumber = 0;
    let currentBatch: string[] = [];
    let batchesProcessed = 0;
    let sessionIndexed = 0;
    let sessionSkipped = 0;
    let sessionErrors = 0;
    const sessionStartTime = Date.now();
    const fileStream = createReadStream(absolutePath, { encoding: 'utf-8' });
    const rl = createInterface({
      input: fileStream,
      crlfDelay: Infinity
    });
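    // The crlfDelay: Infinity option above makes readline treat \r\n as a
    // single line break, so Windows-style files don't yield phantom lines.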
    const processBatch = async (batch: string[], lineNumber: number) => {
      if (batch.length === 0) return;
      if (isInterrupted) return;
      const bulkOperations: any[] = [];
      // Generate hashes for all items in the batch first
      const batchWithHashes = await Promise.all(
        batch.map(async (plaintext: string) => ({
          plaintext,
          hashes: await generateHashes(plaintext)
        }))
      );
      if (checkDuplicates) {
        // Check which items already exist (by plaintext or any hash)
        const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
        const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
        const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256);
        const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512);
        const existingCheck = await client.search({
          index: INDEX_NAME,
          size: batchSize * 5,
          query: {
            bool: {
              should: [
                { terms: { 'plaintext.keyword': batch } },
                { terms: { md5: md5List } },
                { terms: { sha1: sha1List } },
                { terms: { sha256: sha256List } },
                { terms: { sha512: sha512List } },
              ],
              minimum_should_match: 1
            }
          },
          _source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
        });
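        // `size: batchSize * 5` caps the hits returned (five query fields per
        // item). If a batch matched more existing documents than that, the
        // overflow would not be returned and a duplicate could slip through;
        // the cap trades completeness for query cost.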
        // Create a set of existing values (plaintexts and hashes) for quick lookup
        const existingHashes = new Set<string>();
        existingCheck.hits.hits.forEach((hit: any) => {
          const src = hit._source;
          existingHashes.add(src.plaintext);
          existingHashes.add(src.md5);
          existingHashes.add(src.sha1);
          existingHashes.add(src.sha256);
          existingHashes.add(src.sha512);
        });
        // Prepare bulk operations only for items without any duplicate hash
        for (const item of batchWithHashes) {
          const isDuplicate =
            existingHashes.has(item.plaintext) ||
            existingHashes.has(item.hashes.md5) ||
            existingHashes.has(item.hashes.sha1) ||
            existingHashes.has(item.hashes.sha256) ||
            existingHashes.has(item.hashes.sha512);
          if (!isDuplicate) {
            bulkOperations.push({ index: { _index: INDEX_NAME } });
            bulkOperations.push(item.hashes);
          } else {
            state.skipped++;
            sessionSkipped++;
          }
        }
      } else {
        // No duplicate checking - index everything
        for (const item of batchWithHashes) {
          bulkOperations.push({ index: { _index: INDEX_NAME } });
          bulkOperations.push(item.hashes);
        }
      }
      // Execute the bulk operation only if there are new items to insert
      if (bulkOperations.length > 0) {
        try {
          const bulkResponse = await client.bulk({
            operations: bulkOperations,
            refresh: false
          });
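          // refresh: false defers making the new documents searchable until
          // the explicit indices.refresh() at the end of the run, which keeps
          // bulk throughput high during indexing.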
          if (bulkResponse.errors) {
            const errorCount = bulkResponse.items.filter((item: any) => item.index?.error).length;
            state.errors += errorCount;
            sessionErrors += errorCount;
            const successCount = (bulkOperations.length / 2) - errorCount;
            state.indexed += successCount;
            sessionIndexed += successCount;
          } else {
            const count = bulkOperations.length / 2;
            state.indexed += count;
            sessionIndexed += count;
          }
        } catch (error) {
          console.error(`\n❌ Error processing batch:`, error);
          const count = bulkOperations.length / 2;
          state.errors += count;
          sessionErrors += count;
        }
      }
      // Update state
      state.lastProcessedLine = lineNumber;
      state.totalLines = lineNumber;
      // Save state periodically (every 10 batches). A batch counter is used
      // because `lineNumber % (batchSize * 10) === 0` only fires when the line
      // count happens to align exactly, which blank or skipped lines prevent.
      batchesProcessed++;
      if (batchesProcessed % 10 === 0) {
        saveState(stateFile, state);
      }
      // Progress indicator
      const elapsed = ((Date.now() - sessionStartTime) / 1000).toFixed(0);
      process.stdout.write(`\r⏳ Line: ${lineNumber} | Session: +${sessionIndexed} indexed, +${sessionSkipped} skipped | Total: ${state.indexed} indexed | Time: ${elapsed}s`);
    };
    for await (const line of rl) {
      if (isInterrupted) break;
      currentLineNumber++;
      // Skip lines already processed in a previous run
      if (currentLineNumber <= resumingFrom) {
        continue;
      }
      const trimmedLine = line.trim();
      if (trimmedLine.length > 0) {
        // Only take the first word (no spaces or separators)
        const firstWord = trimmedLine.split(/\s+/)[0];
        if (firstWord) {
          currentBatch.push(firstWord);
          if (currentBatch.length >= batchSize) {
            await processBatch(currentBatch, currentLineNumber);
            currentBatch = [];
          }
        }
      }
    }
    // Process any remaining items in the last batch
    if (currentBatch.length > 0 && !isInterrupted) {
      await processBatch(currentBatch, currentLineNumber);
    }
    if (isInterrupted) {
      return;
    }
    // Refresh the index so the new documents become searchable
    console.log('\n\n🔄 Refreshing index...');
    await client.indices.refresh({ index: INDEX_NAME });
    // Delete the state file on successful completion
    deleteState(stateFile);
    const duration = ((Date.now() - sessionStartTime) / 1000).toFixed(2);
    const rate = sessionIndexed > 0 ? (sessionIndexed / parseFloat(duration)).toFixed(0) : '0';
    console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log('✅ Indexing complete!');
    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`Total lines processed: ${currentLineNumber}`);
    if (resumingFrom > 0) {
      console.log(`Lines skipped (resumed): ${resumingFrom}`);
      console.log(`Lines processed this session: ${currentLineNumber - resumingFrom}`);
    }
    console.log(`Successfully indexed (total): ${state.indexed}`);
    console.log(`Successfully indexed (session): ${sessionIndexed}`);
    console.log(`Skipped duplicates (total): ${state.skipped}`);
    console.log(`Skipped duplicates (session): ${sessionSkipped}`);
    console.log(`Errors (total): ${state.errors}`);
    console.log(`Session duration: ${duration}s`);
    console.log(`Session rate: ${rate} docs/sec`);
    console.log('');
  } catch (error) {
    // Save state on error so the run can be resumed
    saveState(stateFile, state);
    console.error(`\n💾 State saved to ${stateFile}`);
    console.error('❌ Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  } finally {
    // Remove signal handlers
    process.removeListener('SIGINT', handleInterrupt);
    process.removeListener('SIGTERM', handleInterrupt);
  }
}
// Parse command line arguments
const args = process.argv.slice(2);
const parsedArgs = parseArgs(args);
if (parsedArgs.showHelp || !parsedArgs.filePath) {
  showHelp();
}
const filePath = parsedArgs.filePath as string;
// Validate that the file exists
if (!existsSync(filePath)) {
  console.error(`❌ File not found: ${filePath}`);
  process.exit(1);
}
console.log(`\n🔧 Configuration:`);
console.log(`   File: ${filePath}`);
console.log(`   Batch size: ${parsedArgs.batchSize}`);
console.log(`   Resume: ${parsedArgs.resume}`);
console.log(`   Check duplicates: ${parsedArgs.checkDuplicates}`);
if (parsedArgs.stateFile) {
  console.log(`   State file: ${parsedArgs.stateFile}`);
}
console.log('');
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);