Files
hasher/scripts/index-file.ts
2025-12-15 16:35:35 +01:00

522 líneas
17 KiB
TypeScript

Este archivo contiene caracteres Unicode invisibles
Este archivo contiene caracteres Unicode invisibles que son indistinguibles para los humanos, pero que pueden ser procesados de forma diferente por un ordenador. Si crees que esto es intencional, puedes ignorar esta advertencia. Usa el botón de Escape para revelarlos.
#!/usr/bin/env node
/**
* Hasher Indexer Script
*
* This script reads a text file with one word/phrase per line and indexes
* all the generated hashes into Redis.
*
* Usage:
* npx tsx scripts/index-file.ts <path-to-file.txt> [options]
* npm run index-file -- <path-to-file.txt> [options]
*
* Options:
* --batch-size=<number> Number of items to process in each batch (default: 100)
* --resume Resume from last saved state (default: true)
* --no-resume Start from beginning, ignore saved state
* --no-check Skip duplicate checking (faster, but may create duplicates)
* --state-file=<path> Custom state file path (default: .indexer-state-<filename>.json)
* --help, -h Show this help message
*/
import Redis from 'ioredis';
import { createReadStream, existsSync, readFileSync, statSync, unlinkSync, writeFileSync } from 'fs';
import { basename, resolve } from 'path';
import { createInterface } from 'readline';
import crypto from 'crypto';
// ---------------------------------------------------------------------------
// Redis connection settings — all overridable through environment variables.
// ---------------------------------------------------------------------------
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
// Logical index name; only used in log output.
const INDEX_NAME = 'hasher';
// Default number of words per Redis pipeline batch (override with --batch-size).
const DEFAULT_BATCH_SIZE = 100;
/**
 * Document stored in Redis for each indexed plaintext: the plaintext itself
 * plus its hex digests under four algorithms (see generateHashes).
 */
interface HashDocument {
  plaintext: string;
  md5: string;        // hex digest
  sha1: string;       // hex digest
  sha256: string;     // hex digest
  sha512: string;     // hex digest
  created_at: string; // ISO-8601 timestamp set when the hashes were generated
}
/**
 * Resume checkpoint persisted to the state file between runs so an
 * interrupted indexing session can pick up where it left off.
 */
interface IndexerState {
  filePath: string;          // absolute path of the input file
  fileHash: string;          // fingerprint (path + size + mtime) used to detect file changes
  lastProcessedLine: number; // 1-based line number of the last completed batch
  totalLines: number;        // highest line number seen so far
  indexed: number;           // cumulative documents written across all sessions
  skipped: number;           // cumulative duplicates skipped across all sessions
  errors: number;            // cumulative pipeline errors across all sessions
  startTime: number;         // epoch ms; reset at the start of each session
  lastUpdate: string;        // ISO-8601 timestamp of the most recent state save
}
/**
 * Result of command-line parsing (produced by parseArgs; see showHelp for
 * the user-facing description of each option).
 */
interface ParsedArgs {
  filePath: string | null;  // positional argument; null when absent
  batchSize: number;        // --batch-size, defaults to DEFAULT_BATCH_SIZE
  resume: boolean;          // default true; --no-resume clears it
  checkDuplicates: boolean; // default true; --no-check clears it
  stateFile: string | null; // --state-file override, if any
  showHelp: boolean;        // --help / -h
}
/**
 * Parse raw CLI tokens into a ParsedArgs object.
 *
 * Supported forms: `--help`/`-h`, `--resume`, `--no-resume`, `--no-check`,
 * `--batch-size=<n>` or `--batch-size <n>` (positive integers only),
 * `--state-file=<path>` or `--state-file <path>`. Any token that does not
 * start with '-' is treated as the input file path (the last one wins).
 * Unknown flags are silently ignored, matching the previous behavior.
 */
function parseArgs(args: string[]): ParsedArgs {
  const parsed: ParsedArgs = {
    filePath: null,
    batchSize: DEFAULT_BATCH_SIZE,
    resume: true,
    checkDuplicates: true,
    stateFile: null,
    showHelp: false
  };
  let idx = 0;
  while (idx < args.length) {
    const token = args[idx];
    switch (token) {
      case '--help':
      case '-h':
        parsed.showHelp = true;
        break;
      case '--resume':
        parsed.resume = true;
        break;
      case '--no-resume':
        parsed.resume = false;
        break;
      case '--no-check':
        parsed.checkDuplicates = false;
        break;
      case '--batch-size': {
        // Space-separated form: --batch-size <value>
        const next = args[idx + 1];
        if (next && !next.startsWith('-')) {
          const size = parseInt(next, 10);
          if (!isNaN(size) && size > 0) {
            parsed.batchSize = size;
            idx++; // consume the value token
          }
        }
        break;
      }
      case '--state-file': {
        // Space-separated form: --state-file <path>
        const next = args[idx + 1];
        if (next && !next.startsWith('-')) {
          parsed.stateFile = next;
          idx++; // consume the value token
        }
        break;
      }
      default:
        if (token.startsWith('--batch-size=')) {
          // '=' form; invalid or non-positive values leave the default in place.
          const size = parseInt(token.split('=')[1], 10);
          if (!isNaN(size) && size > 0) {
            parsed.batchSize = size;
          }
        } else if (token.startsWith('--state-file=')) {
          parsed.stateFile = token.split('=')[1];
        } else if (!token.startsWith('-')) {
          // Positional argument - treat as file path
          parsed.filePath = token;
        }
        break;
    }
    idx++;
  }
  return parsed;
}
/**
 * Compute a short (8 hex chars) fingerprint for a file based on its path,
 * size and modification time. Used to decide whether a saved resume state
 * still matches the input file, without hashing the whole wordlist.
 *
 * FIX: the original called `require('fs')` inside this ES-module file
 * (which imports from 'fs' at the top); `require` is not defined under
 * ESM. Use the imported `statSync` instead.
 */
function getFileHash(filePath: string): string {
  const stats = statSync(filePath);
  // Path + size + mtime is a cheap proxy for "same file contents".
  const hashInput = `${filePath}:${stats.size}:${stats.mtime.getTime()}`;
  return crypto.createHash('md5').update(hashInput).digest('hex').substring(0, 8);
}
/**
 * Derive the default resume-state file path for an input file:
 * ".indexer-state-<name>.json" resolved against the current working
 * directory, where <name> is the input's base name minus its final
 * extension (e.g. "words.txt" -> "words").
 */
function getDefaultStateFile(filePath: string): string {
  const baseName = basename(filePath);
  const stem = baseName.replace(/\.[^.]+$/, '');
  return resolve(`.indexer-state-${stem}.json`);
}
/**
 * Load a previously saved indexer state from disk.
 * Returns null when the file does not exist; a read or parse failure
 * logs a warning and also yields null (never throws).
 */
function loadState(stateFile: string): IndexerState | null {
  try {
    if (!existsSync(stateFile)) {
      return null;
    }
    const raw = readFileSync(stateFile, 'utf-8');
    return JSON.parse(raw) as IndexerState;
  } catch (error) {
    console.warn(`⚠️ Could not load state file: ${error}`);
    return null;
  }
}
/**
 * Persist the indexer state to disk, stamping `lastUpdate` with the
 * current time first. Write failures are logged but never thrown, so a
 * bad state file cannot abort an indexing run.
 */
function saveState(stateFile: string, state: IndexerState): void {
  try {
    state.lastUpdate = new Date().toISOString();
    const serialized = JSON.stringify(state, null, 2);
    writeFileSync(stateFile, serialized, 'utf-8');
  } catch (error) {
    console.error(`❌ Could not save state file: ${error}`);
  }
}
/**
 * Remove the state file if it exists. A missing file is a no-op and any
 * filesystem failure only produces a warning (never throws).
 */
function deleteState(stateFile: string): void {
  try {
    if (!existsSync(stateFile)) {
      return;
    }
    unlinkSync(stateFile);
  } catch (error) {
    console.warn(`⚠️ Could not delete state file: ${error}`);
  }
}
/**
 * Build the full hash document for one plaintext value: hex digests under
 * MD5, SHA-1, SHA-256 and SHA-512 plus a creation timestamp.
 *
 * Declared async to preserve the existing Promise-returning contract at
 * the call sites, even though every digest below is computed synchronously.
 */
async function generateHashes(plaintext: string): Promise<HashDocument> {
  const digest = (algorithm: string): string =>
    crypto.createHash(algorithm).update(plaintext).digest('hex');
  return {
    plaintext,
    md5: digest('md5'),
    sha1: digest('sha1'),
    sha256: digest('sha256'),
    sha512: digest('sha512'),
    created_at: new Date().toISOString()
  };
}
/**
 * Print the full usage text to stdout and terminate the process with
 * exit code 0. Never returns.
 */
function showHelp() {
  const usage = `
Hasher Indexer Script
Usage:
npx tsx scripts/index-file.ts <path-to-file.txt> [options]
npm run index-file -- <path-to-file.txt> [options]
Options:
--batch-size=<number> Number of items to process in each batch (default: 100)
--batch-size <number> Alternative syntax for batch size
--resume Resume from last saved state (default)
--no-resume Start from beginning, ignore saved state
--no-check Skip duplicate checking (faster, but may create duplicates)
--state-file=<path> Custom state file path
--help, -h Show this help message
Environment Variables:
REDIS_HOST Redis host (default: localhost)
REDIS_PORT Redis port (default: 6379)
REDIS_PASSWORD Redis password (optional)
REDIS_DB Redis database number (default: 0)
Examples:
npx tsx scripts/index-file.ts wordlist.txt
npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
npx tsx scripts/index-file.ts wordlist.txt --no-resume
npx tsx scripts/index-file.ts wordlist.txt --no-check
npm run index-file -- wordlist.txt --batch-size=500 --no-check
State Management:
The script automatically saves progress to a state file. If interrupted,
it will resume from where it left off on the next run. Use --no-resume
to start fresh.
Duplicate Checking:
By default, the script checks if each plaintext or hash already exists
in the index before inserting. Use --no-check to skip this verification
for faster indexing (useful when you're sure there are no duplicates).
`;
  console.log(usage);
  process.exit(0);
}
/**
 * Stream a wordlist file into Redis, one hash document per word.
 *
 * Reads the file line by line (so arbitrarily large files are fine), takes
 * the first whitespace-delimited word of each non-empty line, generates
 * MD5/SHA1/SHA256/SHA512 digests for it and writes the document plus one
 * reverse-lookup key per digest via a Redis pipeline. Progress is
 * checkpointed to a state file so an interrupted run can resume; SIGINT /
 * SIGTERM save state before exiting (a second signal force-quits).
 *
 * @param filePath        Path to the input wordlist (one entry per line).
 * @param batchSize       Number of words per Redis pipeline batch.
 * @param shouldResume    Resume from a saved state file when it matches the file.
 * @param checkDuplicates Skip entries whose plaintext or any digest already exists.
 * @param customStateFile Optional state-file path override (null = default).
 */
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
  const client = new Redis({
    host: REDIS_HOST,
    port: REDIS_PORT,
    password: REDIS_PASSWORD,
    db: REDIS_DB,
    // Linear backoff between reconnect attempts, capped at 2 seconds.
    retryStrategy: (times) => Math.min(times * 50, 2000),
  });
  const absolutePath = resolve(filePath);
  const stateFile = customStateFile || getDefaultStateFile(absolutePath);
  const fileHash = getFileHash(absolutePath);
  // Fresh state; may be replaced by a compatible saved state below.
  let state: IndexerState = {
    filePath: absolutePath,
    fileHash,
    lastProcessedLine: 0,
    totalLines: 0,
    indexed: 0,
    skipped: 0,
    errors: 0,
    startTime: Date.now(),
    lastUpdate: new Date().toISOString()
  };
  // Resume only when the saved fingerprint matches the current file;
  // a changed file invalidates the checkpoint.
  const existingState = loadState(stateFile);
  let resumingFrom = 0;
  if (shouldResume && existingState) {
    if (existingState.fileHash === fileHash) {
      state = existingState;
      resumingFrom = state.lastProcessedLine;
      state.startTime = Date.now(); // Reset start time for this session
      console.log(`📂 Found existing state, resuming from line ${resumingFrom}`);
    } else {
      console.log(`⚠️ File has changed since last run, starting fresh`);
      deleteState(stateFile);
    }
  } else if (!shouldResume) {
    deleteState(stateFile);
  }
  console.log(`📚 Hasher Indexer`);
  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
  console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT} (DB ${REDIS_DB})`);
  console.log(`Index: ${INDEX_NAME}`);
  console.log(`File: ${filePath}`);
  console.log(`Batch size: ${batchSize}`);
  console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
  console.log(`State file: ${stateFile}`);
  if (resumingFrom > 0) {
    console.log(`Resuming from: line ${resumingFrom}`);
    console.log(`Already indexed: ${state.indexed}`);
    console.log(`Already skipped: ${state.skipped}`);
  }
  console.log('');
  // First signal saves state and exits cleanly; a second one force-quits.
  let isInterrupted = false;
  const handleInterrupt = () => {
    if (isInterrupted) {
      console.log('\n\n⚡ Force quit. Progress saved.');
      process.exit(1);
    }
    isInterrupted = true;
    console.log('\n\n⏸ Interrupted! Saving state...');
    saveState(stateFile, state);
    console.log(`💾 State saved to ${stateFile}`);
    console.log(` Resume with: npx tsx scripts/index-file.ts ${filePath}`);
    console.log(` Or start fresh with: npx tsx scripts/index-file.ts ${filePath} --no-resume`);
    process.exit(0);
  };
  process.on('SIGINT', handleInterrupt);
  process.on('SIGTERM', handleInterrupt);
  try {
    // Test connection
    console.log('🔗 Connecting to Redis...');
    await client.ping();
    console.log('✅ Connected successfully\n');
    // Process file line by line using streams
    console.log('📖 Processing file...\n');
    let currentLineNumber = 0;
    let currentBatch: string[] = [];
    let sessionIndexed = 0;
    let sessionSkipped = 0;
    let sessionErrors = 0;
    // FIX: count batches explicitly for the periodic state save; the old
    // `lineNumber % (batchSize * 10) === 0` check almost never fired once
    // blank lines shifted batch boundaries off exact multiples.
    let batchesSinceSave = 0;
    const sessionStartTime = Date.now();
    const fileStream = createReadStream(absolutePath, { encoding: 'utf-8' });
    const rl = createInterface({
      input: fileStream,
      crlfDelay: Infinity
    });
    // Hash, dedupe-check and pipeline one batch of words into Redis.
    const processBatch = async (batch: string[], lineNumber: number) => {
      if (batch.length === 0) return;
      if (isInterrupted) return;
      // Generate hashes for all items in batch first
      const batchWithHashes = await Promise.all(
        batch.map(async (plaintext: string) => ({
          plaintext,
          hashes: await generateHashes(plaintext)
        }))
      );
      const pipeline = client.pipeline();
      let toIndex: typeof batchWithHashes = [];
      if (checkDuplicates) {
        // Check which items already exist (by plaintext key first, then by
        // any of the four reverse-lookup digest keys).
        const existenceChecks = await Promise.all(
          batchWithHashes.map(async (item) => {
            const plaintextExists = await client.exists(`hash:plaintext:${item.plaintext}`);
            if (plaintextExists) return { item, exists: true };
            // Check if any hash exists
            const md5Exists = await client.exists(`hash:index:md5:${item.hashes.md5}`);
            const sha1Exists = await client.exists(`hash:index:sha1:${item.hashes.sha1}`);
            const sha256Exists = await client.exists(`hash:index:sha256:${item.hashes.sha256}`);
            const sha512Exists = await client.exists(`hash:index:sha512:${item.hashes.sha512}`);
            return {
              item,
              exists: md5Exists || sha1Exists || sha256Exists || sha512Exists
            };
          })
        );
        for (const check of existenceChecks) {
          if (check.exists) {
            state.skipped++;
            sessionSkipped++;
          } else {
            toIndex.push(check.item);
          }
        }
      } else {
        // No duplicate checking - index everything
        toIndex = batchWithHashes;
      }
      // Execute bulk operations
      if (toIndex.length > 0) {
        try {
          for (const item of toIndex) {
            const doc = item.hashes;
            const key = `hash:plaintext:${doc.plaintext}`;
            // Store main document
            pipeline.set(key, JSON.stringify(doc));
            // Create indexes for each hash type
            pipeline.set(`hash:index:md5:${doc.md5}`, doc.plaintext);
            pipeline.set(`hash:index:sha1:${doc.sha1}`, doc.plaintext);
            pipeline.set(`hash:index:sha256:${doc.sha256}`, doc.plaintext);
            pipeline.set(`hash:index:sha512:${doc.sha512}`, doc.plaintext);
            // Update statistics
            pipeline.hincrby('hash:stats', 'count', 1);
            pipeline.hincrby('hash:stats', 'size', JSON.stringify(doc).length);
          }
          const results = await pipeline.exec();
          // Count per-command errors reported by the pipeline.
          const errorCount = results?.filter(([err]) => err !== null).length || 0;
          if (errorCount > 0) {
            state.errors += errorCount;
            sessionErrors += errorCount;
            const successCount = toIndex.length - errorCount;
            state.indexed += successCount;
            sessionIndexed += successCount;
          } else {
            state.indexed += toIndex.length;
            sessionIndexed += toIndex.length;
          }
        } catch (error) {
          console.error(`\n❌ Error processing batch:`, error);
          state.errors += toIndex.length;
          sessionErrors += toIndex.length;
        }
      }
      // Update state
      state.lastProcessedLine = lineNumber;
      state.totalLines = lineNumber;
      // Save state periodically (every 10 batches)
      batchesSinceSave++;
      if (batchesSinceSave >= 10) {
        saveState(stateFile, state);
        batchesSinceSave = 0;
      }
      // Progress indicator
      const elapsed = ((Date.now() - sessionStartTime) / 1000).toFixed(0);
      process.stdout.write(`\r⏳ Line: ${lineNumber} | Session: +${sessionIndexed} indexed, +${sessionSkipped} skipped | Total: ${state.indexed} indexed | Time: ${elapsed}s`);
    };
    for await (const line of rl) {
      if (isInterrupted) break;
      currentLineNumber++;
      // Skip already processed lines
      if (currentLineNumber <= resumingFrom) {
        continue;
      }
      const trimmedLine = line.trim();
      if (trimmedLine.length > 0) {
        // Only take first word (no spaces or separators)
        const firstWord = trimmedLine.split(/\s+/)[0];
        if (firstWord) {
          currentBatch.push(firstWord);
          if (currentBatch.length >= batchSize) {
            await processBatch(currentBatch, currentLineNumber);
            currentBatch = [];
          }
        }
      }
    }
    // Process remaining items in last batch
    if (currentBatch.length > 0 && !isInterrupted) {
      await processBatch(currentBatch, currentLineNumber);
    }
    if (isInterrupted) {
      return;
    }
    // No refresh needed for Redis
    console.log('\n\n✅ All data persisted to Redis');
    // Delete state file on successful completion
    deleteState(stateFile);
    const duration = ((Date.now() - sessionStartTime) / 1000).toFixed(2);
    const rate = sessionIndexed > 0 ? (sessionIndexed / parseFloat(duration)).toFixed(0) : '0';
    console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log('✅ Indexing complete!');
    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`Total lines processed: ${currentLineNumber}`);
    if (resumingFrom > 0) {
      console.log(`Lines skipped (resumed): ${resumingFrom}`);
      console.log(`Lines processed this session: ${currentLineNumber - resumingFrom}`);
    }
    console.log(`Successfully indexed (total): ${state.indexed}`);
    console.log(`Successfully indexed (session): ${sessionIndexed}`);
    console.log(`Skipped duplicates (total): ${state.skipped}`);
    console.log(`Skipped duplicates (session): ${sessionSkipped}`);
    console.log(`Errors (total): ${state.errors}`);
    console.log(`Session duration: ${duration}s`);
    console.log(`Session rate: ${rate} docs/sec`);
    console.log('');
  } catch (error) {
    // Save state on error
    saveState(stateFile, state);
    console.error(`\n💾 State saved to ${stateFile}`);
    console.error('❌ Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  } finally {
    // Remove signal handlers
    process.removeListener('SIGINT', handleInterrupt);
    process.removeListener('SIGTERM', handleInterrupt);
    // FIX: close the connection. An open ioredis socket otherwise keeps the
    // Node event loop alive, so the script would never exit after a
    // successful run (the error path exits explicitly above).
    client.disconnect();
  }
}
// Parse command line arguments
const args = process.argv.slice(2);
const parsedArgs = parseArgs(args);
if (parsedArgs.showHelp || !parsedArgs.filePath) {
showHelp();
}
const filePath = parsedArgs.filePath as string;
// Validate file exists
if (!existsSync(filePath)) {
console.error(`❌ File not found: ${filePath}`);
process.exit(1);
}
console.log(`\n🔧 Configuration:`);
console.log(` File: ${filePath}`);
console.log(` Batch size: ${parsedArgs.batchSize}`);
console.log(` Resume: ${parsedArgs.resume}`);
console.log(` Check duplicates: ${parsedArgs.checkDuplicates}`);
if (parsedArgs.stateFile) {
console.log(` State file: ${parsedArgs.stateFile}`);
}
console.log('');
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);