index file: stream big files
Signed-off-by: ale <ale@manalejandro.com>
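The commit replaces the eager readFileSync load with a readline stream, so the input file never has to fit in memory at once. A minimal sketch of the pattern, separate from the diff below; the file path, batch size, and handleBatch consumer are illustrative stand-ins, not code from the commit:

// Sketch of the streaming pattern this commit adopts.
// handleBatch is a hypothetical stand-in for the bulk-indexing logic.
import { createReadStream } from 'fs';
import { createInterface } from 'readline';

async function streamWords(path: string, batchSize = 1000): Promise<void> {
  const rl = createInterface({
    input: createReadStream(path, { encoding: 'utf-8' }),
    crlfDelay: Infinity // treat \r\n as a single line break
  });

  let batch: string[] = [];
  for await (const line of rl) {
    const word = line.trim().split(/\s+/)[0]; // keep only the first word per line
    if (!word) continue;
    batch.push(word);
    if (batch.length >= batchSize) {
      await handleBatch(batch);
      batch = [];
    }
  }
  if (batch.length > 0) await handleBatch(batch); // flush the final partial batch
}

async function handleBatch(batch: string[]): Promise<void> {
  console.log(`processing ${batch.length} words`);
}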
@@ -17,8 +17,9 @@
 */
 
 import { Client } from '@elastic/elasticsearch';
-import { readFileSync } from 'fs';
+import { createReadStream } from 'fs';
 import { resolve } from 'path';
+import { createInterface } from 'readline';
 import crypto from 'crypto';
 
 const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
@@ -89,43 +90,46 @@ async function indexFile(filePath: string, batchSize: number = DEFAULT_BATCH_SIZE
     await client.cluster.health({});
     console.log('✅ Connected successfully\n');
 
-    // Read file
-    console.log('📖 Reading file...');
+    // Process file line by line using streams
+    console.log('📖 Processing file...\n');
     const absolutePath = resolve(filePath);
-    const content = readFileSync(absolutePath, 'utf-8');
-    const lines = content.split('\n')
-      .map(line => line.trim())
-      .filter(line => line.length > 0);
-
-    console.log(`✅ Found ${lines.length} words/phrases to process\n`);
 
     // Process in batches
+    let totalLines = 0;
     let indexed = 0;
     let skipped = 0;
     let errors = 0;
     const startTime = Date.now();
 
+    let currentBatch: string[] = [];
+
+    const fileStream = createReadStream(absolutePath, { encoding: 'utf-8' });
+    const rl = createInterface({
+      input: fileStream,
+      crlfDelay: Infinity
+    });
+
+    const processBatch = async (batch: string[]) => {
+      if (batch.length === 0) return;
+
-    for (let i = 0; i < lines.length; i += batchSize) {
-      const batch = lines.slice(i, i + batchSize);
       const bulkOperations: any[] = [];
 
       // Generate hashes for all items in batch first
       const batchWithHashes = await Promise.all(
-        batch.map(async (plaintext) => ({
+        batch.map(async (plaintext: string) => ({
           plaintext,
           hashes: await generateHashes(plaintext)
         }))
       );
 
       // Check which items already exist (by plaintext or any hash)
-      const md5List = batchWithHashes.map(item => item.hashes.md5);
-      const sha1List = batchWithHashes.map(item => item.hashes.sha1);
-      const sha256List = batchWithHashes.map(item => item.hashes.sha256);
-      const sha512List = batchWithHashes.map(item => item.hashes.sha512);
+      const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
+      const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
+      const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256);
+      const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512);
 
       const existingCheck = await client.search({
         index: INDEX_NAME,
-        size: batchSize * 5, // Account for potential multiple matches
+        size: batchSize * 5,
         query: {
           bool: {
             should: [
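The hunk above cuts off inside the duplicate check. Only the hash lists are visible in the diff, so the shape of the should clause below is an assumption based on the surrounding comment ("by plaintext or any hash"); the field names are guesses:

      // Assumed continuation of the duplicate-check query; the terms clauses
      // and field names are conjecture, only md5List..sha512List appear in the diff.
      const existingCheck = await client.search({
        index: INDEX_NAME,
        size: batchSize * 5,
        query: {
          bool: {
            should: [
              { terms: { plaintext: batch } },
              { terms: { 'hashes.md5': md5List } },
              { terms: { 'hashes.sha1': sha1List } },
              { terms: { 'hashes.sha256': sha256List } },
              { terms: { 'hashes.sha512': sha512List } }
            ],
            minimum_should_match: 1
          }
        }
      });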
@@ -185,15 +189,35 @@ async function indexFile(filePath: string, batchSize: number = DEFAULT_BATCH_SIZE
           indexed += bulkOperations.length / 2;
         }
       } catch (error) {
-        console.error(`\n❌ Error processing batch ${i}-${i + batchSize}:`, error);
+        console.error(`\n❌ Error processing batch:`, error);
         errors += bulkOperations.length / 2;
       }
 
       // Progress indicator
-      const progress = Math.min(i + batchSize, lines.length);
-      const percent = ((progress / lines.length) * 100).toFixed(1);
-      process.stdout.write(`\r⏳ Progress: ${progress}/${lines.length} (${percent}%) - Indexed: ${indexed}, Skipped: ${skipped}, Errors: ${errors}`);
-    }
+      process.stdout.write(`\r⏳ Processed: ${totalLines} - Indexed: ${indexed}, Skipped: ${skipped}, Errors: ${errors}`);
+    };
+
+    for await (const line of rl) {
+      const trimmedLine = line.trim();
+      if (trimmedLine.length > 0) {
+        // Only take first word (no spaces or separators)
+        const firstWord = trimmedLine.split(/\s+/)[0];
+        if (firstWord) {
+          totalLines++;
+          currentBatch.push(firstWord);
+
+          if (currentBatch.length >= batchSize) {
+            await processBatch(currentBatch);
+            currentBatch = [];
+          }
+        }
+      }
+    }
+
+    // Process remaining items in last batch
+    if (currentBatch.length > 0) {
+      await processBatch(currentBatch);
+    }
 
     // Refresh index
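A note on the new loop: because processBatch is awaited inside for await, the readline iterator does not pull further lines from the stream until the current batch has been indexed, so memory stays bounded by roughly one batch regardless of file size. While testing, heap usage can be spot-checked with a line like this (not part of the commit):

// Hypothetical debug line, e.g. at the end of processBatch:
console.log(`heap: ${(process.memoryUsage().heapUsed / 1048576).toFixed(1)} MB`);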
@@ -206,7 +230,7 @@ async function indexFile(filePath: string, batchSize: number = DEFAULT_BATCH_SIZE
     console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
     console.log('✅ Indexing complete!');
     console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
-    console.log(`Total processed: ${lines.length}`);
+    console.log(`Total processed: ${totalLines}`);
     console.log(`Successfully indexed: ${indexed}`);
     console.log(`Skipped (duplicates): ${skipped}`);
     console.log(`Errors: ${errors}`);