@@ -14,6 +14,7 @@
|
|||||||
* --batch-size=<number> Number of items to process in each batch (default: 100)
|
* --batch-size=<number> Number of items to process in each batch (default: 100)
|
||||||
* --resume Resume from last saved state (default: true)
|
* --resume Resume from last saved state (default: true)
|
||||||
* --no-resume Start from beginning, ignore saved state
|
* --no-resume Start from beginning, ignore saved state
|
||||||
|
* --no-check Skip duplicate checking (faster, but may create duplicates)
|
||||||
* --state-file=<path> Custom state file path (default: .indexer-state-<filename>.json)
|
* --state-file=<path> Custom state file path (default: .indexer-state-<filename>.json)
|
||||||
* --help, -h Show this help message
|
* --help, -h Show this help message
|
||||||
*/
|
*/
|
||||||
@@ -54,6 +55,7 @@ interface ParsedArgs {
|
|||||||
filePath: string | null;
|
filePath: string | null;
|
||||||
batchSize: number;
|
batchSize: number;
|
||||||
resume: boolean;
|
resume: boolean;
|
||||||
|
checkDuplicates: boolean;
|
||||||
stateFile: string | null;
|
stateFile: string | null;
|
||||||
showHelp: boolean;
|
showHelp: boolean;
|
||||||
}
|
}
|
||||||
@@ -63,6 +65,7 @@ function parseArgs(args: string[]): ParsedArgs {
|
|||||||
filePath: null,
|
filePath: null,
|
||||||
batchSize: DEFAULT_BATCH_SIZE,
|
batchSize: DEFAULT_BATCH_SIZE,
|
||||||
resume: true,
|
resume: true,
|
||||||
|
checkDuplicates: true,
|
||||||
stateFile: null,
|
stateFile: null,
|
||||||
showHelp: false
|
showHelp: false
|
||||||
};
|
};
|
||||||
@@ -76,6 +79,8 @@ function parseArgs(args: string[]): ParsedArgs {
|
|||||||
result.resume = true;
|
result.resume = true;
|
||||||
} else if (arg === '--no-resume') {
|
} else if (arg === '--no-resume') {
|
||||||
result.resume = false;
|
result.resume = false;
|
||||||
|
} else if (arg === '--no-check') {
|
||||||
|
result.checkDuplicates = false;
|
||||||
} else if (arg.startsWith('--batch-size=')) {
|
} else if (arg.startsWith('--batch-size=')) {
|
||||||
const value = arg.split('=')[1];
|
const value = arg.split('=')[1];
|
||||||
const parsed = parseInt(value, 10);
|
const parsed = parseInt(value, 10);
|
||||||
@@ -180,6 +185,7 @@ Options:
|
|||||||
--batch-size <number> Alternative syntax for batch size
|
--batch-size <number> Alternative syntax for batch size
|
||||||
--resume Resume from last saved state (default)
|
--resume Resume from last saved state (default)
|
||||||
--no-resume Start from beginning, ignore saved state
|
--no-resume Start from beginning, ignore saved state
|
||||||
|
--no-check Skip duplicate checking (faster, but may create duplicates)
|
||||||
--state-file=<path> Custom state file path
|
--state-file=<path> Custom state file path
|
||||||
--help, -h Show this help message
|
--help, -h Show this help message
|
||||||
|
|
||||||
@@ -191,17 +197,23 @@ Examples:
|
|||||||
npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
|
npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
|
||||||
npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
|
npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
|
||||||
npx tsx scripts/index-file.ts wordlist.txt --no-resume
|
npx tsx scripts/index-file.ts wordlist.txt --no-resume
|
||||||
npm run index-file -- wordlist.txt --batch-size=500
|
npx tsx scripts/index-file.ts wordlist.txt --no-check
|
||||||
|
npm run index-file -- wordlist.txt --batch-size=500 --no-check
|
||||||
|
|
||||||
State Management:
|
State Management:
|
||||||
The script automatically saves progress to a state file. If interrupted,
|
The script automatically saves progress to a state file. If interrupted,
|
||||||
it will resume from where it left off on the next run. Use --no-resume
|
it will resume from where it left off on the next run. Use --no-resume
|
||||||
to start fresh.
|
to start fresh.
|
||||||
|
|
||||||
|
Duplicate Checking:
|
||||||
|
By default, the script checks if each plaintext or hash already exists
|
||||||
|
in the index before inserting. Use --no-check to skip this verification
|
||||||
|
for faster indexing (useful when you're sure there are no duplicates).
|
||||||
`);
|
`);
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, customStateFile: string | null) {
|
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
|
||||||
const client = new Client({ node: ELASTICSEARCH_NODE });
|
const client = new Client({ node: ELASTICSEARCH_NODE });
|
||||||
const absolutePath = resolve(filePath);
|
const absolutePath = resolve(filePath);
|
||||||
const stateFile = customStateFile || getDefaultStateFile(absolutePath);
|
const stateFile = customStateFile || getDefaultStateFile(absolutePath);
|
||||||
@@ -244,6 +256,7 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
|||||||
console.log(`Index: ${INDEX_NAME}`);
|
console.log(`Index: ${INDEX_NAME}`);
|
||||||
console.log(`File: ${filePath}`);
|
console.log(`File: ${filePath}`);
|
||||||
console.log(`Batch size: ${batchSize}`);
|
console.log(`Batch size: ${batchSize}`);
|
||||||
|
console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
|
||||||
console.log(`State file: ${stateFile}`);
|
console.log(`State file: ${stateFile}`);
|
||||||
if (resumingFrom > 0) {
|
if (resumingFrom > 0) {
|
||||||
console.log(`Resuming from: line ${resumingFrom}`);
|
console.log(`Resuming from: line ${resumingFrom}`);
|
||||||
@@ -307,58 +320,64 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
|||||||
}))
|
}))
|
||||||
);
|
);
|
||||||
|
|
||||||
// Check which items already exist (by plaintext or any hash)
|
if (checkDuplicates) {
|
||||||
const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
|
// Check which items already exist (by plaintext or any hash)
|
||||||
const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
|
const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
|
||||||
const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256);
|
const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
|
||||||
const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512);
|
const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256);
|
||||||
|
const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512);
|
||||||
|
|
||||||
const existingCheck = await client.search({
|
const existingCheck = await client.search({
|
||||||
index: INDEX_NAME,
|
index: INDEX_NAME,
|
||||||
size: batchSize * 5,
|
size: batchSize * 5,
|
||||||
query: {
|
query: {
|
||||||
bool: {
|
bool: {
|
||||||
should: [
|
should: [
|
||||||
{ terms: { 'plaintext.keyword': batch } },
|
{ terms: { 'plaintext.keyword': batch } },
|
||||||
{ terms: { md5: md5List } },
|
{ terms: { md5: md5List } },
|
||||||
{ terms: { sha1: sha1List } },
|
{ terms: { sha1: sha1List } },
|
||||||
{ terms: { sha256: sha256List } },
|
{ terms: { sha256: sha256List } },
|
||||||
{ terms: { sha512: sha512List } },
|
{ terms: { sha512: sha512List } },
|
||||||
],
|
],
|
||||||
minimum_should_match: 1
|
minimum_should_match: 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
_source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
|
||||||
|
});
|
||||||
|
|
||||||
|
// Create a set of existing hashes for quick lookup
|
||||||
|
const existingHashes = new Set<string>();
|
||||||
|
existingCheck.hits.hits.forEach((hit: any) => {
|
||||||
|
const src = hit._source;
|
||||||
|
existingHashes.add(src.plaintext);
|
||||||
|
existingHashes.add(src.md5);
|
||||||
|
existingHashes.add(src.sha1);
|
||||||
|
existingHashes.add(src.sha256);
|
||||||
|
existingHashes.add(src.sha512);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Prepare bulk operations only for items that don't have any duplicate hash
|
||||||
|
for (const item of batchWithHashes) {
|
||||||
|
const isDuplicate =
|
||||||
|
existingHashes.has(item.plaintext) ||
|
||||||
|
existingHashes.has(item.hashes.md5) ||
|
||||||
|
existingHashes.has(item.hashes.sha1) ||
|
||||||
|
existingHashes.has(item.hashes.sha256) ||
|
||||||
|
existingHashes.has(item.hashes.sha512);
|
||||||
|
|
||||||
|
if (!isDuplicate) {
|
||||||
|
bulkOperations.push({ index: { _index: INDEX_NAME } });
|
||||||
|
bulkOperations.push(item.hashes);
|
||||||
|
} else {
|
||||||
|
state.skipped++;
|
||||||
|
sessionSkipped++;
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
_source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
|
} else {
|
||||||
});
|
// No duplicate checking - index everything
|
||||||
|
for (const item of batchWithHashes) {
|
||||||
// Create a set of existing hashes for quick lookup
|
|
||||||
const existingHashes = new Set<string>();
|
|
||||||
existingCheck.hits.hits.forEach((hit: any) => {
|
|
||||||
const src = hit._source;
|
|
||||||
existingHashes.add(src.plaintext);
|
|
||||||
existingHashes.add(src.md5);
|
|
||||||
existingHashes.add(src.sha1);
|
|
||||||
existingHashes.add(src.sha256);
|
|
||||||
existingHashes.add(src.sha512);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Prepare bulk operations only for items that don't have any duplicate hash
|
|
||||||
let batchSkipped = 0;
|
|
||||||
for (const item of batchWithHashes) {
|
|
||||||
const isDuplicate =
|
|
||||||
existingHashes.has(item.plaintext) ||
|
|
||||||
existingHashes.has(item.hashes.md5) ||
|
|
||||||
existingHashes.has(item.hashes.sha1) ||
|
|
||||||
existingHashes.has(item.hashes.sha256) ||
|
|
||||||
existingHashes.has(item.hashes.sha512);
|
|
||||||
|
|
||||||
if (!isDuplicate) {
|
|
||||||
bulkOperations.push({ index: { _index: INDEX_NAME } });
|
bulkOperations.push({ index: { _index: INDEX_NAME } });
|
||||||
bulkOperations.push(item.hashes);
|
bulkOperations.push(item.hashes);
|
||||||
} else {
|
|
||||||
batchSkipped++;
|
|
||||||
state.skipped++;
|
|
||||||
sessionSkipped++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -498,9 +517,10 @@ console.log(`\n🔧 Configuration:`);
|
|||||||
console.log(` File: ${filePath}`);
|
console.log(` File: ${filePath}`);
|
||||||
console.log(` Batch size: ${parsedArgs.batchSize}`);
|
console.log(` Batch size: ${parsedArgs.batchSize}`);
|
||||||
console.log(` Resume: ${parsedArgs.resume}`);
|
console.log(` Resume: ${parsedArgs.resume}`);
|
||||||
|
console.log(` Check duplicates: ${parsedArgs.checkDuplicates}`);
|
||||||
if (parsedArgs.stateFile) {
|
if (parsedArgs.stateFile) {
|
||||||
console.log(` State file: ${parsedArgs.stateFile}`);
|
console.log(` State file: ${parsedArgs.stateFile}`);
|
||||||
}
|
}
|
||||||
console.log('');
|
console.log('');
|
||||||
|
|
||||||
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.stateFile).catch(console.error);
|
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);
|
||||||
|
|||||||
Referencia en una nueva incidencia
Block a user