#!/usr/bin/env node

/**
 * Hasher Indexer Script
 *
 * This script reads a text file with one word/phrase per line and indexes
 * all the generated hashes into Elasticsearch.
 *
 * Usage:
 *   npx tsx scripts/index-file.ts <path-to-file.txt> [options]
 *   npm run index-file -- <path-to-file.txt> [options]
 *
 * Options:
 *   --batch-size=<number>   Number of items to process in each batch (default: 100)
 *   --resume                Resume from last saved state (default: true)
 *   --no-resume             Start from beginning, ignore saved state
 *   --state-file=<path>     Custom state file path (default: .indexer-state-<filename>.json)
 *   --help, -h              Show this help message
 */

import { Client } from '@elastic/elasticsearch';
// statSync is included here so the file stays pure ESM: tsx runs this script
// as an ES module, where `require` is not available.
import { createReadStream, existsSync, readFileSync, writeFileSync, unlinkSync, statSync } from 'fs';
import { resolve, basename } from 'path';
import { createInterface } from 'readline';
import crypto from 'crypto';

const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const INDEX_NAME = 'hasher';
const DEFAULT_BATCH_SIZE = 100;

interface HashDocument {
  plaintext: string;
  md5: string;
  sha1: string;
  sha256: string;
  sha512: string;
  bcrypt: string;
  created_at: string;
}

interface IndexerState {
  filePath: string;
  fileHash: string;
  lastProcessedLine: number;
  totalLines: number;
  indexed: number;
  skipped: number;
  errors: number;
  startTime: number;
  lastUpdate: string;
}

interface ParsedArgs {
  filePath: string | null;
  batchSize: number;
  resume: boolean;
  stateFile: string | null;
  showHelp: boolean;
}

function parseArgs(args: string[]): ParsedArgs {
  const result: ParsedArgs = {
    filePath: null,
    batchSize: DEFAULT_BATCH_SIZE,
    resume: true,
    stateFile: null,
    showHelp: false
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    if (arg === '--help' || arg === '-h') {
      result.showHelp = true;
    } else if (arg === '--resume') {
      result.resume = true;
    } else if (arg === '--no-resume') {
      result.resume = false;
    } else if (arg.startsWith('--batch-size=')) {
      const value = arg.split('=')[1];
      const parsed = parseInt(value, 10);
      if (!isNaN(parsed) && parsed > 0) {
        result.batchSize = parsed;
      }
    } else if (arg === '--batch-size') {
      // Support the "--batch-size <value>" format
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        const parsed = parseInt(nextArg, 10);
        if (!isNaN(parsed) && parsed > 0) {
          result.batchSize = parsed;
          i++; // Skip the consumed value argument
        }
      }
    } else if (arg.startsWith('--state-file=')) {
      result.stateFile = arg.split('=')[1];
    } else if (arg === '--state-file') {
      const nextArg = args[i + 1];
      if (nextArg && !nextArg.startsWith('-')) {
        result.stateFile = nextArg;
        i++;
      }
    } else if (!arg.startsWith('-')) {
      // Positional argument - treat as the input file path
      result.filePath = arg;
    }
  }

  return result;
}
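
// Example: parseArgs(['wordlist.txt', '--batch-size=500', '--no-resume'])
// yields { filePath: 'wordlist.txt', batchSize: 500, resume: false,
// stateFile: null, showHelp: false }.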

function getFileHash(filePath: string): string {
  // Create a fingerprint from the file path, size, and mtime for quick
  // identification; any change to the file invalidates saved state.
  const stats = statSync(filePath);
  const hashInput = `${filePath}:${stats.size}:${stats.mtime.getTime()}`;
  return crypto.createHash('md5').update(hashInput).digest('hex').substring(0, 8);
}

function getDefaultStateFile(filePath: string): string {
  const fileName = basename(filePath).replace(/\.[^.]+$/, '');
  return resolve(`.indexer-state-${fileName}.json`);
}
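
// Example: getDefaultStateFile('/data/wordlist.txt') resolves to
// <cwd>/.indexer-state-wordlist.json.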

function loadState(stateFile: string): IndexerState | null {
  try {
    if (existsSync(stateFile)) {
      const data = readFileSync(stateFile, 'utf-8');
      return JSON.parse(data) as IndexerState;
    }
  } catch (error) {
    console.warn(`⚠️ Could not load state file: ${error}`);
  }
  return null;
}

function saveState(stateFile: string, state: IndexerState): void {
  try {
    state.lastUpdate = new Date().toISOString();
    writeFileSync(stateFile, JSON.stringify(state, null, 2), 'utf-8');
  } catch (error) {
    console.error(`❌ Could not save state file: ${error}`);
  }
}

function deleteState(stateFile: string): void {
  try {
    if (existsSync(stateFile)) {
      unlinkSync(stateFile);
    }
  } catch (error) {
    console.warn(`⚠️ Could not delete state file: ${error}`);
  }
}

async function generateHashes(plaintext: string): Promise<HashDocument> {
  const bcrypt = await import('bcrypt');
  const bcryptHash = await bcrypt.default.hash(plaintext, 10);

  return {
    plaintext,
    md5: crypto.createHash('md5').update(plaintext).digest('hex'),
    sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
    sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
    sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
    bcrypt: bcryptHash,
    created_at: new Date().toISOString()
  };
}
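
// Illustrative shape of the document produced above (md5/sha1/sha256 are the
// well-known digests of "password"; sha512/bcrypt/created_at elided or made up):
//
//   {
//     plaintext: 'password',
//     md5: '5f4dcc3b5aa765d61d8327deb882cf99',
//     sha1: '5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8',
//     sha256: '5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8',
//     sha512: '...',
//     bcrypt: '$2b$10$...',
//     created_at: '2024-01-01T00:00:00.000Z'
//   }
//
// Note: bcrypt embeds a random salt, so the same plaintext yields a different
// bcrypt string on every run. That is why the duplicate check below keys on
// plaintext and the deterministic digests, never on bcrypt.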

function showHelp() {
  console.log(`
Hasher Indexer Script

Usage:
  npx tsx scripts/index-file.ts <path-to-file.txt> [options]
  npm run index-file -- <path-to-file.txt> [options]

Options:
  --batch-size=<number>   Number of items to process in each batch (default: 100)
  --batch-size <number>   Alternative syntax for batch size
  --resume                Resume from last saved state (default)
  --no-resume             Start from beginning, ignore saved state
  --state-file=<path>     Custom state file path
  --help, -h              Show this help message

Environment Variables:
  ELASTICSEARCH_NODE      Elasticsearch node URL (default: http://localhost:9200)

Examples:
  npx tsx scripts/index-file.ts wordlist.txt
  npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
  npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
  npx tsx scripts/index-file.ts wordlist.txt --no-resume
  npm run index-file -- wordlist.txt --batch-size=500

State Management:
  The script automatically saves progress to a state file. If interrupted,
  it will resume from where it left off on the next run. Use --no-resume
  to start fresh.
`);
  process.exit(0);
}

async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, customStateFile: string | null) {
  const client = new Client({ node: ELASTICSEARCH_NODE });
  const absolutePath = resolve(filePath);
  const stateFile = customStateFile || getDefaultStateFile(absolutePath);
  const fileHash = getFileHash(absolutePath);

  // State management
  let state: IndexerState = {
    filePath: absolutePath,
    fileHash,
    lastProcessedLine: 0,
    totalLines: 0,
    indexed: 0,
    skipped: 0,
    errors: 0,
    startTime: Date.now(),
    lastUpdate: new Date().toISOString()
  };

  // Check for existing state
  const existingState = loadState(stateFile);
  let resumingFrom = 0;

  if (shouldResume && existingState) {
    if (existingState.fileHash === fileHash) {
      state = existingState;
      resumingFrom = state.lastProcessedLine;
      state.startTime = Date.now(); // Reset start time for this session
      console.log(`📂 Found existing state, resuming from line ${resumingFrom}`);
    } else {
      console.log(`⚠️ File has changed since last run, starting fresh`);
      deleteState(stateFile);
    }
  } else if (!shouldResume) {
    deleteState(stateFile);
  }

  console.log(`📚 Hasher Indexer`);
  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
  console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
  console.log(`Index: ${INDEX_NAME}`);
  console.log(`File: ${filePath}`);
  console.log(`Batch size: ${batchSize}`);
  console.log(`State file: ${stateFile}`);
  if (resumingFrom > 0) {
    console.log(`Resuming from: line ${resumingFrom}`);
    console.log(`Already indexed: ${state.indexed}`);
    console.log(`Already skipped: ${state.skipped}`);
  }
  console.log('');

  // Handle interruption signals
  let isInterrupted = false;
  const handleInterrupt = () => {
    if (isInterrupted) {
      console.log('\n\n⚡ Force quit. Progress saved.');
      process.exit(1);
    }
    isInterrupted = true;
    console.log('\n\n⏸️ Interrupted! Saving state...');
    saveState(stateFile, state);
    console.log(`💾 State saved to ${stateFile}`);
    console.log(`   Resume with: npx tsx scripts/index-file.ts ${filePath}`);
    console.log(`   Or start fresh with: npx tsx scripts/index-file.ts ${filePath} --no-resume`);
    process.exit(0);
  };

  process.on('SIGINT', handleInterrupt);
  process.on('SIGTERM', handleInterrupt);

  try {
    // Test connection
    console.log('🔗 Connecting to Elasticsearch...');
    await client.cluster.health({});
    console.log('✅ Connected successfully\n');

    // Process file line by line using streams
    console.log('📖 Processing file...\n');

    let currentLineNumber = 0;
    let currentBatch: string[] = [];
    let sessionIndexed = 0;
    let sessionSkipped = 0;
    let sessionErrors = 0;
    const sessionStartTime = Date.now();

    const fileStream = createReadStream(absolutePath, { encoding: 'utf-8' });
    const rl = createInterface({
      input: fileStream,
      crlfDelay: Infinity // treat \r\n as a single line break
    });
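
    // Streaming via readline keeps memory usage flat regardless of file size;
    // only one batch of words is held in memory at a time.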

    const processBatch = async (batch: string[], lineNumber: number) => {
      if (batch.length === 0) return;
      if (isInterrupted) return;

      const bulkOperations: any[] = [];

      // Generate hashes for all items in the batch first
      const batchWithHashes = await Promise.all(
        batch.map(async (plaintext: string) => ({
          plaintext,
          hashes: await generateHashes(plaintext)
        }))
      );

      // Check which items already exist (by plaintext or any hash)
      const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
      const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
      const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256);
      const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512);

      const existingCheck = await client.search({
        index: INDEX_NAME,
        size: batchSize * 5,
        query: {
          bool: {
            should: [
              { terms: { 'plaintext.keyword': batch } },
              { terms: { md5: md5List } },
              { terms: { sha1: sha1List } },
              { terms: { sha256: sha256List } },
              { terms: { sha512: sha512List } },
            ],
            minimum_should_match: 1
          }
        },
        _source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
      });
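
      // Assumptions behind this query (not visible in this script): the
      // `hasher` index mapping must expose `plaintext` with a `.keyword`
      // sub-field and store the digest fields as exact-match (keyword-like)
      // values, otherwise the `terms` clauses will not match. Each terms list
      // holds at most `batchSize` values, well below Elasticsearch's default
      // `index.max_terms_count` limit of 65536.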

      // Build a set of existing values (plaintexts and hashes) for quick lookup
      const existingHashes = new Set<string>();
      existingCheck.hits.hits.forEach((hit: any) => {
        const src = hit._source;
        existingHashes.add(src.plaintext);
        existingHashes.add(src.md5);
        existingHashes.add(src.sha1);
        existingHashes.add(src.sha256);
        existingHashes.add(src.sha512);
      });

      // Prepare bulk operations only for items without any duplicate hash
      let batchSkipped = 0;
      for (const item of batchWithHashes) {
        const isDuplicate =
          existingHashes.has(item.plaintext) ||
          existingHashes.has(item.hashes.md5) ||
          existingHashes.has(item.hashes.sha1) ||
          existingHashes.has(item.hashes.sha256) ||
          existingHashes.has(item.hashes.sha512);

        if (!isDuplicate) {
          bulkOperations.push({ index: { _index: INDEX_NAME } });
          bulkOperations.push(item.hashes);
        } else {
          batchSkipped++;
          state.skipped++;
          sessionSkipped++;
        }
      }

      // Execute the bulk operation only if there are new items to insert.
      // The operations array alternates action metadata and document, so
      // bulkOperations.length / 2 is the number of documents sent.
      if (bulkOperations.length > 0) {
        try {
          const bulkResponse = await client.bulk({
            operations: bulkOperations,
            refresh: false
          });

          if (bulkResponse.errors) {
            const errorCount = bulkResponse.items.filter((item: any) => item.index?.error).length;
            state.errors += errorCount;
            sessionErrors += errorCount;
            const successCount = (bulkOperations.length / 2) - errorCount;
            state.indexed += successCount;
            sessionIndexed += successCount;
          } else {
            const count = bulkOperations.length / 2;
            state.indexed += count;
            sessionIndexed += count;
          }
        } catch (error) {
          console.error(`\n❌ Error processing batch:`, error);
          const count = bulkOperations.length / 2;
          state.errors += count;
          sessionErrors += count;
        }
      }

      // Update state
      state.lastProcessedLine = lineNumber;
      state.totalLines = lineNumber;

      // Save state periodically (roughly every 10 batches)
      if (lineNumber % (batchSize * 10) === 0) {
        saveState(stateFile, state);
      }

      // Progress indicator
      const elapsed = ((Date.now() - sessionStartTime) / 1000).toFixed(0);
      process.stdout.write(`\r⏳ Line: ${lineNumber} | Session: +${sessionIndexed} indexed, +${sessionSkipped} skipped | Total: ${state.indexed} indexed | Time: ${elapsed}s`);
    };
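
    // Note: lastProcessedLine only advances at batch boundaries, so a resumed
    // run may re-read lines after the last save point; the duplicate check
    // above keeps those re-reads from being indexed twice.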

    for await (const line of rl) {
      if (isInterrupted) break;

      currentLineNumber++;

      // Skip lines already processed in a previous run
      if (currentLineNumber <= resumingFrom) {
        continue;
      }

      const trimmedLine = line.trim();
      if (trimmedLine.length > 0) {
        // Only take the first word (no spaces or separators)
        const firstWord = trimmedLine.split(/\s+/)[0];
        if (firstWord) {
          currentBatch.push(firstWord);

          if (currentBatch.length >= batchSize) {
            await processBatch(currentBatch, currentLineNumber);
            currentBatch = [];
          }
        }
      }
    }

    // Process remaining items in the last batch
    if (currentBatch.length > 0 && !isInterrupted) {
      await processBatch(currentBatch, currentLineNumber);
    }

    if (isInterrupted) {
      return;
    }

    // Refresh the index so new documents become searchable
    console.log('\n\n🔄 Refreshing index...');
    await client.indices.refresh({ index: INDEX_NAME });

    // Delete state file on successful completion
    deleteState(stateFile);

    const duration = ((Date.now() - sessionStartTime) / 1000).toFixed(2);
    const rate = sessionIndexed > 0 ? (sessionIndexed / parseFloat(duration)).toFixed(0) : '0';

    console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log('✅ Indexing complete!');
    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`Total lines processed: ${currentLineNumber}`);
    if (resumingFrom > 0) {
      console.log(`Lines skipped (resumed): ${resumingFrom}`);
      console.log(`Lines processed this session: ${currentLineNumber - resumingFrom}`);
    }
    console.log(`Successfully indexed (total): ${state.indexed}`);
    console.log(`Successfully indexed (session): ${sessionIndexed}`);
    console.log(`Skipped duplicates (total): ${state.skipped}`);
    console.log(`Skipped duplicates (session): ${sessionSkipped}`);
    console.log(`Errors (total): ${state.errors}`);
    console.log(`Session duration: ${duration}s`);
    console.log(`Session rate: ${rate} docs/sec`);
    console.log('');

  } catch (error) {
    // Save state on error
    saveState(stateFile, state);
    console.error(`\n💾 State saved to ${stateFile}`);
    console.error('❌ Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  } finally {
    // Remove signal handlers
    process.removeListener('SIGINT', handleInterrupt);
    process.removeListener('SIGTERM', handleInterrupt);
  }
}
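
// For reference, a minimal sketch of an index mapping compatible with the
// exact-match queries used in processBatch. This is an assumption, not part
// of this script: the `hasher` index is presumably created elsewhere.
//
//   await client.indices.create({
//     index: 'hasher',
//     mappings: {
//       properties: {
//         plaintext: { type: 'text', fields: { keyword: { type: 'keyword' } } },
//         md5: { type: 'keyword' },
//         sha1: { type: 'keyword' },
//         sha256: { type: 'keyword' },
//         sha512: { type: 'keyword' },
//         bcrypt: { type: 'keyword' },
//         created_at: { type: 'date' }
//       }
//     }
//   });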

// Parse command line arguments
const args = process.argv.slice(2);
const parsedArgs = parseArgs(args);

if (parsedArgs.showHelp || !parsedArgs.filePath) {
  showHelp();
}

const filePath = parsedArgs.filePath as string;

// Validate file exists
if (!existsSync(filePath)) {
  console.error(`❌ File not found: ${filePath}`);
  process.exit(1);
}

console.log(`\n🔧 Configuration:`);
console.log(`   File: ${filePath}`);
console.log(`   Batch size: ${parsedArgs.batchSize}`);
console.log(`   Resume: ${parsedArgs.resume}`);
if (parsedArgs.stateFile) {
  console.log(`   State file: ${parsedArgs.stateFile}`);
}
console.log('');

indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.stateFile).catch(console.error);