diff --git a/scripts/index-file.ts b/scripts/index-file.ts index 5fd9963..a094e84 100644 --- a/scripts/index-file.ts +++ b/scripts/index-file.ts @@ -17,8 +17,9 @@ */ import { Client } from '@elastic/elasticsearch'; -import { readFileSync } from 'fs'; +import { createReadStream } from 'fs'; import { resolve } from 'path'; +import { createInterface } from 'readline'; import crypto from 'crypto'; const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200'; @@ -89,43 +90,46 @@ async function indexFile(filePath: string, batchSize: number = DEFAULT_BATCH_SIZ await client.cluster.health({}); console.log('✅ Connected successfully\n'); - // Read file - console.log('📖 Reading file...'); + // Process file line by line using streams + console.log('📖 Processing file...\n'); const absolutePath = resolve(filePath); - const content = readFileSync(absolutePath, 'utf-8'); - const lines = content.split('\n') - .map(line => line.trim()) - .filter(line => line.length > 0); - console.log(`✅ Found ${lines.length} words/phrases to process\n`); - - // Process in batches + let totalLines = 0; let indexed = 0; let skipped = 0; let errors = 0; const startTime = Date.now(); + + let currentBatch: string[] = []; + + const fileStream = createReadStream(absolutePath, { encoding: 'utf-8' }); + const rl = createInterface({ + input: fileStream, + crlfDelay: Infinity + }); + + const processBatch = async (batch: string[]) => { + if (batch.length === 0) return; - for (let i = 0; i < lines.length; i += batchSize) { - const batch = lines.slice(i, i + batchSize); const bulkOperations: any[] = []; // Generate hashes for all items in batch first const batchWithHashes = await Promise.all( - batch.map(async (plaintext) => ({ + batch.map(async (plaintext: string) => ({ plaintext, hashes: await generateHashes(plaintext) })) ); // Check which items already exist (by plaintext or any hash) - const md5List = batchWithHashes.map(item => item.hashes.md5); - const sha1List = batchWithHashes.map(item => item.hashes.sha1); - const sha256List = batchWithHashes.map(item => item.hashes.sha256); - const sha512List = batchWithHashes.map(item => item.hashes.sha512); + const md5List = batchWithHashes.map((item: any) => item.hashes.md5); + const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1); + const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256); + const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512); const existingCheck = await client.search({ index: INDEX_NAME, - size: batchSize * 5, // Account for potential multiple matches + size: batchSize * 5, query: { bool: { should: [ @@ -185,15 +189,35 @@ async function indexFile(filePath: string, batchSize: number = DEFAULT_BATCH_SIZ indexed += bulkOperations.length / 2; } } catch (error) { - console.error(`\n❌ Error processing batch ${i}-${i + batchSize}:`, error); + console.error(`\n❌ Error processing batch:`, error); errors += bulkOperations.length / 2; } } // Progress indicator - const progress = Math.min(i + batchSize, lines.length); - const percent = ((progress / lines.length) * 100).toFixed(1); - process.stdout.write(`\r⏳ Progress: ${progress}/${lines.length} (${percent}%) - Indexed: ${indexed}, Skipped: ${skipped}, Errors: ${errors}`); + process.stdout.write(`\r⏳ Processed: ${totalLines} - Indexed: ${indexed}, Skipped: ${skipped}, Errors: ${errors}`); + }; + + for await (const line of rl) { + const trimmedLine = line.trim(); + if (trimmedLine.length > 0) { + // Only take first word (no spaces or separators) + const firstWord = trimmedLine.split(/\s+/)[0]; + if (firstWord) { + totalLines++; + currentBatch.push(firstWord); + + if (currentBatch.length >= batchSize) { + await processBatch(currentBatch); + currentBatch = []; + } + } + } + } + + // Process remaining items in last batch + if (currentBatch.length > 0) { + await processBatch(currentBatch); } // Refresh index @@ -206,7 +230,7 @@ async function indexFile(filePath: string, batchSize: number = DEFAULT_BATCH_SIZ console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'); console.log('✅ Indexing complete!'); console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`); - console.log(`Total processed: ${lines.length}`); + console.log(`Total processed: ${totalLines}`); console.log(`Successfully indexed: ${indexed}`); console.log(`Skipped (duplicates): ${skipped}`); console.log(`Errors: ${errors}`);