From 179e192e823f34b80ed35ef226cf568f7f75c1ea Mon Sep 17 00:00:00 2001 From: ale Date: Sun, 7 Dec 2025 01:28:37 +0100 Subject: [PATCH] script --no-check Signed-off-by: ale --- scripts/index-file.ts | 122 ++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/scripts/index-file.ts b/scripts/index-file.ts index 6d9a018..a2b2cde 100644 --- a/scripts/index-file.ts +++ b/scripts/index-file.ts @@ -14,6 +14,7 @@ * --batch-size= Number of items to process in each batch (default: 100) * --resume Resume from last saved state (default: true) * --no-resume Start from beginning, ignore saved state + * --no-check Skip duplicate checking (faster, but may create duplicates) * --state-file= Custom state file path (default: .indexer-state-.json) * --help, -h Show this help message */ @@ -54,6 +55,7 @@ interface ParsedArgs { filePath: string | null; batchSize: number; resume: boolean; + checkDuplicates: boolean; stateFile: string | null; showHelp: boolean; } @@ -63,6 +65,7 @@ function parseArgs(args: string[]): ParsedArgs { filePath: null, batchSize: DEFAULT_BATCH_SIZE, resume: true, + checkDuplicates: true, stateFile: null, showHelp: false }; @@ -76,6 +79,8 @@ function parseArgs(args: string[]): ParsedArgs { result.resume = true; } else if (arg === '--no-resume') { result.resume = false; + } else if (arg === '--no-check') { + result.checkDuplicates = false; } else if (arg.startsWith('--batch-size=')) { const value = arg.split('=')[1]; const parsed = parseInt(value, 10); @@ -180,6 +185,7 @@ Options: --batch-size Alternative syntax for batch size --resume Resume from last saved state (default) --no-resume Start from beginning, ignore saved state + --no-check Skip duplicate checking (faster, but may create duplicates) --state-file= Custom state file path --help, -h Show this help message @@ -191,17 +197,23 @@ Examples: npx tsx scripts/index-file.ts wordlist.txt --batch-size=500 npx tsx scripts/index-file.ts wordlist.txt --batch-size 500 npx tsx scripts/index-file.ts wordlist.txt --no-resume - npm run index-file -- wordlist.txt --batch-size=500 + npx tsx scripts/index-file.ts wordlist.txt --no-check + npm run index-file -- wordlist.txt --batch-size=500 --no-check State Management: The script automatically saves progress to a state file. If interrupted, it will resume from where it left off on the next run. Use --no-resume to start fresh. + +Duplicate Checking: + By default, the script checks if each plaintext or hash already exists + in the index before inserting. Use --no-check to skip this verification + for faster indexing (useful when you're sure there are no duplicates). `); process.exit(0); } -async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, customStateFile: string | null) { +async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) { const client = new Client({ node: ELASTICSEARCH_NODE }); const absolutePath = resolve(filePath); const stateFile = customStateFile || getDefaultStateFile(absolutePath); @@ -244,6 +256,7 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool console.log(`Index: ${INDEX_NAME}`); console.log(`File: ${filePath}`); console.log(`Batch size: ${batchSize}`); + console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`); console.log(`State file: ${stateFile}`); if (resumingFrom > 0) { console.log(`Resuming from: line ${resumingFrom}`); @@ -307,58 +320,64 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool })) ); - // Check which items already exist (by plaintext or any hash) - const md5List = batchWithHashes.map((item: any) => item.hashes.md5); - const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1); - const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256); - const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512); + if (checkDuplicates) { + // Check which items already exist (by plaintext or any hash) + const md5List = batchWithHashes.map((item: any) => item.hashes.md5); + const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1); + const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256); + const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512); - const existingCheck = await client.search({ - index: INDEX_NAME, - size: batchSize * 5, - query: { - bool: { - should: [ - { terms: { 'plaintext.keyword': batch } }, - { terms: { md5: md5List } }, - { terms: { sha1: sha1List } }, - { terms: { sha256: sha256List } }, - { terms: { sha512: sha512List } }, - ], - minimum_should_match: 1 + const existingCheck = await client.search({ + index: INDEX_NAME, + size: batchSize * 5, + query: { + bool: { + should: [ + { terms: { 'plaintext.keyword': batch } }, + { terms: { md5: md5List } }, + { terms: { sha1: sha1List } }, + { terms: { sha256: sha256List } }, + { terms: { sha512: sha512List } }, + ], + minimum_should_match: 1 + } + }, + _source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'] + }); + + // Create a set of existing hashes for quick lookup + const existingHashes = new Set(); + existingCheck.hits.hits.forEach((hit: any) => { + const src = hit._source; + existingHashes.add(src.plaintext); + existingHashes.add(src.md5); + existingHashes.add(src.sha1); + existingHashes.add(src.sha256); + existingHashes.add(src.sha512); + }); + + // Prepare bulk operations only for items that don't have any duplicate hash + for (const item of batchWithHashes) { + const isDuplicate = + existingHashes.has(item.plaintext) || + existingHashes.has(item.hashes.md5) || + existingHashes.has(item.hashes.sha1) || + existingHashes.has(item.hashes.sha256) || + existingHashes.has(item.hashes.sha512); + + if (!isDuplicate) { + bulkOperations.push({ index: { _index: INDEX_NAME } }); + bulkOperations.push(item.hashes); + } else { + state.skipped++; + sessionSkipped++; } - }, - _source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'] - }); - - // Create a set of existing hashes for quick lookup - const existingHashes = new Set(); - existingCheck.hits.hits.forEach((hit: any) => { - const src = hit._source; - existingHashes.add(src.plaintext); - existingHashes.add(src.md5); - existingHashes.add(src.sha1); - existingHashes.add(src.sha256); - existingHashes.add(src.sha512); - }); - - // Prepare bulk operations only for items that don't have any duplicate hash - let batchSkipped = 0; - for (const item of batchWithHashes) { - const isDuplicate = - existingHashes.has(item.plaintext) || - existingHashes.has(item.hashes.md5) || - existingHashes.has(item.hashes.sha1) || - existingHashes.has(item.hashes.sha256) || - existingHashes.has(item.hashes.sha512); - - if (!isDuplicate) { + } + } else { + // No duplicate checking - index everything + for (const item of batchWithHashes) { bulkOperations.push({ index: { _index: INDEX_NAME } }); bulkOperations.push(item.hashes); - } else { - batchSkipped++; - state.skipped++; - sessionSkipped++; } } @@ -498,9 +517,10 @@ console.log(`\n🔧 Configuration:`); console.log(` File: ${filePath}`); console.log(` Batch size: ${parsedArgs.batchSize}`); console.log(` Resume: ${parsedArgs.resume}`); +console.log(` Check duplicates: ${parsedArgs.checkDuplicates}`); if (parsedArgs.stateFile) { console.log(` State file: ${parsedArgs.stateFile}`); } console.log(''); -indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.stateFile).catch(console.error); +indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);