script --no-check

Signed-off-by: ale <ale@manalejandro.com>
Este commit está contenido en:
ale
2025-12-07 01:28:37 +01:00
padre bb234fef1e
commit 179e192e82

Ver fichero

@@ -14,6 +14,7 @@
* --batch-size=<number> Number of items to process in each batch (default: 100)
* --resume Resume from last saved state (default: true)
* --no-resume Start from beginning, ignore saved state
* --no-check Skip duplicate checking (faster, but may create duplicates)
* --state-file=<path> Custom state file path (default: .indexer-state-<filename>.json)
* --help, -h Show this help message
*/
@@ -54,6 +55,7 @@ interface ParsedArgs {
filePath: string | null;
batchSize: number;
resume: boolean;
checkDuplicates: boolean;
stateFile: string | null;
showHelp: boolean;
}
@@ -63,6 +65,7 @@ function parseArgs(args: string[]): ParsedArgs {
filePath: null,
batchSize: DEFAULT_BATCH_SIZE,
resume: true,
checkDuplicates: true,
stateFile: null,
showHelp: false
};
@@ -76,6 +79,8 @@ function parseArgs(args: string[]): ParsedArgs {
result.resume = true;
} else if (arg === '--no-resume') {
result.resume = false;
} else if (arg === '--no-check') {
result.checkDuplicates = false;
} else if (arg.startsWith('--batch-size=')) {
const value = arg.split('=')[1];
const parsed = parseInt(value, 10);
@@ -180,6 +185,7 @@ Options:
--batch-size <number> Alternative syntax for batch size
--resume Resume from last saved state (default)
--no-resume Start from beginning, ignore saved state
--no-check Skip duplicate checking (faster, but may create duplicates)
--state-file=<path> Custom state file path
--help, -h Show this help message
@@ -191,17 +197,23 @@ Examples:
npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
npx tsx scripts/index-file.ts wordlist.txt --no-resume
npx tsx scripts/index-file.ts wordlist.txt --no-check
npm run index-file -- wordlist.txt --batch-size=500 --no-check

State Management:
The script automatically saves progress to a state file. If interrupted,
it will resume from where it left off on the next run. Use --no-resume
to start fresh.

Duplicate Checking:
By default, the script checks if each plaintext or hash already exists
in the index before inserting. Use --no-check to skip this verification
for faster indexing (useful when you're sure there are no duplicates).
`);
process.exit(0);
}
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
const client = new Client({ node: ELASTICSEARCH_NODE });
const absolutePath = resolve(filePath);
const stateFile = customStateFile || getDefaultStateFile(absolutePath);
@@ -244,6 +256,7 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
console.log(`Index: ${INDEX_NAME}`);
console.log(`File: ${filePath}`);
console.log(`Batch size: ${batchSize}`);
console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
console.log(`State file: ${stateFile}`);
if (resumingFrom > 0) {
console.log(`Resuming from: line ${resumingFrom}`);
@@ -307,6 +320,7 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
}))
);

if (checkDuplicates) {
// Check which items already exist (by plaintext or any hash)
const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
@@ -343,7 +357,6 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
});

// Prepare bulk operations only for items that don't have any duplicate hash
let batchSkipped = 0;
for (const item of batchWithHashes) {
const isDuplicate =
existingHashes.has(item.plaintext) ||
@@ -356,11 +369,17 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
bulkOperations.push({ index: { _index: INDEX_NAME } });
bulkOperations.push(item.hashes);
} else {
batchSkipped++;
state.skipped++;
sessionSkipped++;
}
}
} else {
// No duplicate checking - index everything
for (const item of batchWithHashes) {
bulkOperations.push({ index: { _index: INDEX_NAME } });
bulkOperations.push(item.hashes);
}
}
// Execute bulk operation only if there are new items to insert
if (bulkOperations.length > 0) {
@@ -498,9 +517,10 @@ console.log(`\n🔧 Configuration:`);
console.log(` File: ${filePath}`);
console.log(` Batch size: ${parsedArgs.batchSize}`);
console.log(` Resume: ${parsedArgs.resume}`);
console.log(` Check duplicates: ${parsedArgs.checkDuplicates}`);
if (parsedArgs.stateFile) {
console.log(` State file: ${parsedArgs.stateFile}`);
}
console.log('');
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);