script --no-check

Signed-off-by: ale <ale@manalejandro.com>
Este commit está contenido en:
ale
2025-12-07 01:28:37 +01:00
padre bb234fef1e
commit 179e192e82

Ver fichero

@@ -14,6 +14,7 @@
* --batch-size=<number> Number of items to process in each batch (default: 100)
* --resume Resume from last saved state (default: true)
* --no-resume Start from beginning, ignore saved state
* --no-check Skip duplicate checking (faster, but may create duplicates)
* --state-file=<path> Custom state file path (default: .indexer-state-<filename>.json)
* --help, -h Show this help message
*/
@@ -54,6 +55,7 @@ interface ParsedArgs {
filePath: string | null;
batchSize: number;
resume: boolean;
checkDuplicates: boolean;
stateFile: string | null;
showHelp: boolean;
}
@@ -63,6 +65,7 @@ function parseArgs(args: string[]): ParsedArgs {
filePath: null,
batchSize: DEFAULT_BATCH_SIZE,
resume: true,
checkDuplicates: true,
stateFile: null,
showHelp: false
};
@@ -76,6 +79,8 @@ function parseArgs(args: string[]): ParsedArgs {
result.resume = true;
} else if (arg === '--no-resume') {
result.resume = false;
} else if (arg === '--no-check') {
result.checkDuplicates = false;
} else if (arg.startsWith('--batch-size=')) {
const value = arg.split('=')[1];
const parsed = parseInt(value, 10);
@@ -180,6 +185,7 @@ Options:
--batch-size <number> Alternative syntax for batch size
--resume Resume from last saved state (default)
--no-resume Start from beginning, ignore saved state
--no-check Skip duplicate checking (faster, but may create duplicates)
--state-file=<path> Custom state file path
--help, -h Show this help message
@@ -191,17 +197,23 @@ Examples:
npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
npx tsx scripts/index-file.ts wordlist.txt --no-resume
npx tsx scripts/index-file.ts wordlist.txt --no-check
npm run index-file -- wordlist.txt --batch-size=500 --no-check

State Management:
The script automatically saves progress to a state file. If interrupted,
it will resume from where it left off on the next run. Use --no-resume
to start fresh.

Duplicate Checking:
By default, the script checks if each plaintext or hash already exists
in the index before inserting. Use --no-check to skip this verification
for faster indexing (useful when you're sure there are no duplicates).
`);
process.exit(0);
}
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
const client = new Client({ node: ELASTICSEARCH_NODE });
const absolutePath = resolve(filePath);
const stateFile = customStateFile || getDefaultStateFile(absolutePath);
@@ -244,6 +256,7 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
console.log(`Index: ${INDEX_NAME}`);
console.log(`File: ${filePath}`);
console.log(`Batch size: ${batchSize}`);
console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
console.log(`State file: ${stateFile}`);
if (resumingFrom > 0) {
console.log(`Resuming from: line ${resumingFrom}`);
@@ -307,6 +320,7 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
}))
);

if (checkDuplicates) {
// Check which items already exist (by plaintext or any hash)
const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
@@ -343,7 +357,6 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
});

// Prepare bulk operations only for items that don't have any duplicate hash
let batchSkipped = 0;
for (const item of batchWithHashes) {
const isDuplicate =
existingHashes.has(item.plaintext) ||
@@ -356,11 +369,17 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
bulkOperations.push({ index: { _index: INDEX_NAME } });
bulkOperations.push(item.hashes);
} else {
batchSkipped++;
state.skipped++;
sessionSkipped++;
}
}
} else {
// No duplicate checking - index everything
for (const item of batchWithHashes) {
bulkOperations.push({ index: { _index: INDEX_NAME } });
bulkOperations.push(item.hashes);
}
}
// Execute bulk operation only if there are new items to insert
if (bulkOperations.length > 0) {
@@ -498,9 +517,10 @@ console.log(`\n🔧 Configuration:`);
console.log(` File: ${filePath}`);
console.log(` Batch size: ${parsedArgs.batchSize}`);
console.log(` Resume: ${parsedArgs.resume}`);
console.log(` Check duplicates: ${parsedArgs.checkDuplicates}`);
if (parsedArgs.stateFile) {
console.log(` State file: ${parsedArgs.stateFile}`);
}
console.log('');
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);