Comparar commits
5 Commits
8fa586731a
...
elasticsea
| Autor | SHA1 | Fecha | |
|---|---|---|---|
|
b91d19dc0b
|
|||
|
da89037125
|
|||
|
20f0503134
|
|||
|
42bc5a15d0
|
|||
|
2de78b7461
|
@@ -11,13 +11,101 @@ interface HashDocument {
|
|||||||
created_at?: string;
|
created_at?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Maximum allowed query length
const MAX_QUERY_LENGTH = 1000;

// Character sequences stripped from queries before they reach Elasticsearch
const DANGEROUS_PATTERNS = [
  /[{}\[\]]/g, // JSON structure characters
  /\$[a-zA-Z]/g, // MongoDB-style operators
  /\\u[0-9a-fA-F]{4}/g, // Unicode escapes
  /<script/gi, // XSS attempts
  /javascript:/gi, // XSS attempts
];

/**
 * Sanitize a raw query string before it is used in a search.
 *
 * Steps, in order:
 *  1. trim and keep only the first whitespace-delimited word;
 *  2. truncate to MAX_QUERY_LENGTH characters;
 *  3. drop NUL characters;
 *  4. strip every DANGEROUS_PATTERNS match, repeating until nothing changes.
 *
 * Step 4 is repeated because a single pass can splice a forbidden token
 * together out of the text around a removed match (for example
 * `<scr<scriptipt` collapses to `<script` after one pass).
 *
 * NOTE(review): the `\$[a-zA-Z]` pattern also removes `$x` sequences from
 * `$`-delimited inputs; confirm that is acceptable for every hash format
 * this endpoint is expected to look up.
 *
 * @param input - raw, untrusted query text
 * @returns the sanitized first word, or '' when nothing usable remains
 */
function sanitizeInput(input: string): string {
  // Trim and take first word only
  let sanitized = input.trim().split(/\s+/)[0] || '';

  // Limit length
  if (sanitized.length > MAX_QUERY_LENGTH) {
    sanitized = sanitized.substring(0, MAX_QUERY_LENGTH);
  }

  // Remove null bytes
  sanitized = sanitized.replace(/\0/g, '');

  // Strip dangerous patterns until a fixed point is reached. Each pass that
  // changes the string strictly shortens it, so the loop always terminates.
  let previous = sanitized;
  do {
    previous = sanitized;
    for (const pattern of DANGEROUS_PATTERNS) {
      sanitized = sanitized.replace(pattern, '');
    }
  } while (sanitized !== previous);

  return sanitized;
}
|
||||||
|
|
||||||
|
/**
 * Pre-flight check applied to the raw query before sanitizing.
 *
 * Rejects input that is empty, longer than MAX_QUERY_LENGTH, or that
 * contains ASCII control characters other than tab, line feed and
 * carriage return.
 *
 * @param input - raw query text
 * @returns true when the input passes all three checks
 */
function isValidInput(input: string): boolean {
  const isEmpty = !input || input.length === 0;
  if (isEmpty) return false;

  const isTooLong = input.length > MAX_QUERY_LENGTH;
  if (isTooLong) return false;

  // Control characters except normal whitespace (\t, \n, \r)
  const hasControlChars = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(input);
  return !hasControlChars;
}
|
||||||
|
|
||||||
export async function POST(request: NextRequest) {
|
export async function POST(request: NextRequest) {
|
||||||
try {
|
try {
|
||||||
const { query } = await request.json();
|
const body = await request.json();
|
||||||
|
|
||||||
|
// Validate request body structure
|
||||||
|
if (!body || typeof body !== 'object') {
|
||||||
|
return NextResponse.json(
|
||||||
|
{ error: 'Invalid request body' },
|
||||||
|
{ status: 400 }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
const { query } = body;
|
||||||
|
|
||||||
|
// Validate query type
|
||||||
if (!query || typeof query !== 'string') {
|
if (!query || typeof query !== 'string') {
|
||||||
return NextResponse.json(
|
return NextResponse.json(
|
||||||
{ error: 'Query parameter is required' },
|
{ error: 'Query parameter is required and must be a string' },
|
||||||
|
{ status: 400 }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate input before processing
|
||||||
|
if (!isValidInput(query)) {
|
||||||
|
return NextResponse.json(
|
||||||
|
{ error: 'Invalid query: contains forbidden characters or is too long' },
|
||||||
|
{ status: 400 }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sanitize input
|
||||||
|
const cleanQuery = sanitizeInput(query);
|
||||||
|
|
||||||
|
if (!cleanQuery) {
|
||||||
|
return NextResponse.json(
|
||||||
|
{ error: 'Invalid query: only whitespace or invalid characters provided' },
|
||||||
{ status: 400 }
|
{ status: 400 }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -25,15 +113,6 @@ export async function POST(request: NextRequest) {
|
|||||||
// Ensure index exists
|
// Ensure index exists
|
||||||
await initializeIndex();
|
await initializeIndex();
|
||||||
|
|
||||||
const cleanQuery = query.trim().split(/\s+/)[0];
|
|
||||||
|
|
||||||
if (!cleanQuery) {
|
|
||||||
return NextResponse.json(
|
|
||||||
{ error: 'Invalid query: only whitespace provided' },
|
|
||||||
{ status: 400 }
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const cleanQueryLower = cleanQuery.toLowerCase();
|
const cleanQueryLower = cleanQuery.toLowerCase();
|
||||||
const hashType = detectHashType(cleanQueryLower);
|
const hashType = detectHashType(cleanQueryLower);
|
||||||
|
|
||||||
|
|||||||
143
app/page.tsx
143
app/page.tsx
@@ -1,7 +1,8 @@
|
|||||||
'use client';
|
'use client';
|
||||||
|
|
||||||
import { useState, useEffect } from 'react';
|
import { useState, useEffect, useCallback, Suspense } from 'react';
|
||||||
import { Search, Copy, Check, Hash, Key, AlertCircle, Loader2, Database } from 'lucide-react';
|
import { useSearchParams } from 'next/navigation';
|
||||||
|
import { Search, Copy, Check, Hash, Key, AlertCircle, Loader2, Database, Link } from 'lucide-react';
|
||||||
|
|
||||||
interface SearchResult {
|
interface SearchResult {
|
||||||
found: boolean;
|
found: boolean;
|
||||||
@@ -45,13 +46,62 @@ function formatNumber(num: number): string {
|
|||||||
return num.toLocaleString();
|
return num.toLocaleString();
|
||||||
}
|
}
|
||||||
|
|
||||||
export default function Home() {
|
function HasherContent() {
|
||||||
|
const searchParams = useSearchParams();
|
||||||
const [query, setQuery] = useState('');
|
const [query, setQuery] = useState('');
|
||||||
const [result, setResult] = useState<SearchResult | null>(null);
|
const [result, setResult] = useState<SearchResult | null>(null);
|
||||||
const [loading, setLoading] = useState(false);
|
const [loading, setLoading] = useState(false);
|
||||||
const [error, setError] = useState('');
|
const [error, setError] = useState('');
|
||||||
const [copiedField, setCopiedField] = useState<string | null>(null);
|
const [copiedField, setCopiedField] = useState<string | null>(null);
|
||||||
const [stats, setStats] = useState<IndexStats | null>(null);
|
const [stats, setStats] = useState<IndexStats | null>(null);
|
||||||
|
const [copiedLink, setCopiedLink] = useState(false);
|
||||||
|
const [initialLoadDone, setInitialLoadDone] = useState(false);
|
||||||
|
|
||||||
|
// Runs a search against /api/search and stores the outcome in component state.
// `updateUrl` controls whether the `q` query parameter of the current URL is
// rewritten afterwards (skipped when the search itself originated from the URL).
const performSearch = useCallback(async (searchQuery: string, updateUrl: boolean = true) => {
  const trimmedQuery = searchQuery.trim();
  if (!trimmedQuery) return;

  // Reset the previous outcome before starting a new request.
  setLoading(true);
  setError('');
  setResult(null);

  try {
    const requestInit = {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ query: trimmedQuery })
    };
    const response = await fetch('/api/search', requestInit);

    if (!response.ok) {
      throw new Error('Search failed');
    }

    setResult(await response.json());

    if (updateUrl) {
      // replaceState (history API) keeps the address bar shareable without
      // re-triggering effects.
      const shareUrl = new URL(window.location.href);
      shareUrl.searchParams.set('q', trimmedQuery);
      window.history.replaceState(null, '', shareUrl.pathname + shareUrl.search);
    }
  } catch (_err) {
    setError('Failed to perform search. Please check your connection.');
  } finally {
    setLoading(false);
  }
}, []);
|
||||||
|
|
||||||
|
// Load query from URL on mount (only once)
|
||||||
|
useEffect(() => {
|
||||||
|
if (initialLoadDone) return;
|
||||||
|
|
||||||
|
const urlQuery = searchParams.get('q');
|
||||||
|
if (urlQuery) {
|
||||||
|
setQuery(urlQuery);
|
||||||
|
performSearch(urlQuery, false);
|
||||||
|
}
|
||||||
|
setInitialLoadDone(true);
|
||||||
|
}, [searchParams, performSearch, initialLoadDone]);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
const fetchStats = async () => {
|
const fetchStats = async () => {
|
||||||
@@ -73,30 +123,7 @@ export default function Home() {
|
|||||||
|
|
||||||
const handleSearch = async (e: React.FormEvent) => {
|
const handleSearch = async (e: React.FormEvent) => {
|
||||||
e.preventDefault();
|
e.preventDefault();
|
||||||
if (!query.trim()) return;
|
performSearch(query);
|
||||||
|
|
||||||
setLoading(true);
|
|
||||||
setError('');
|
|
||||||
setResult(null);
|
|
||||||
|
|
||||||
try {
|
|
||||||
const response = await fetch('/api/search', {
|
|
||||||
method: 'POST',
|
|
||||||
headers: { 'Content-Type': 'application/json' },
|
|
||||||
body: JSON.stringify({ query: query.trim() })
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!response.ok) {
|
|
||||||
throw new Error('Search failed');
|
|
||||||
}
|
|
||||||
|
|
||||||
const data = await response.json();
|
|
||||||
setResult(data);
|
|
||||||
} catch (_err) {
|
|
||||||
setError('Failed to perform search. Please check your connection.');
|
|
||||||
} finally {
|
|
||||||
setLoading(false);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const copyToClipboard = (text: string, field: string) => {
|
const copyToClipboard = (text: string, field: string) => {
|
||||||
@@ -105,6 +132,14 @@ export default function Home() {
|
|||||||
setTimeout(() => setCopiedField(null), 2000);
|
setTimeout(() => setCopiedField(null), 2000);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Copies a shareable URL (current location with `q` set to the trimmed query)
// to the clipboard. The "copied" indicator is shown for two seconds, and only
// after the clipboard write has actually succeeded.
const copyShareLink = () => {
  const url = new URL(window.location.href);
  url.searchParams.set('q', query.trim());

  // writeText returns a promise and may reject (permission denied) or throw
  // (clipboard API unavailable); running it inside an async function funnels
  // both failure modes into the catch below instead of leaving them unhandled.
  void (async () => {
    try {
      await navigator.clipboard.writeText(url.toString());
      setCopiedLink(true);
      setTimeout(() => setCopiedLink(false), 2000);
    } catch (err) {
      console.error('Failed to copy share link:', err);
    }
  })();
};
|
||||||
|
|
||||||
const HashDisplay = ({ label, value, field }: { label: string; value: string; field: string }) => (
|
const HashDisplay = ({ label, value, field }: { label: string; value: string; field: string }) => (
|
||||||
<div className="bg-gray-50 rounded-lg p-4 border border-gray-200">
|
<div className="bg-gray-50 rounded-lg p-4 border border-gray-200">
|
||||||
<div className="flex items-center justify-between mb-2">
|
<div className="flex items-center justify-between mb-2">
|
||||||
@@ -166,19 +201,35 @@ export default function Home() {
|
|||||||
value={query}
|
value={query}
|
||||||
onChange={(e) => setQuery(e.target.value)}
|
onChange={(e) => setQuery(e.target.value)}
|
||||||
placeholder="Enter a hash or plaintext..."
|
placeholder="Enter a hash or plaintext..."
|
||||||
className="w-full px-6 py-4 pr-14 text-lg rounded-2xl border-2 border-gray-200 focus:border-blue-500 focus:ring-4 focus:ring-blue-100 outline-none transition-all shadow-sm"
|
className="w-full px-6 py-4 pr-28 text-lg rounded-2xl border-2 border-gray-200 focus:border-blue-500 focus:ring-4 focus:ring-blue-100 outline-none transition-all shadow-sm"
|
||||||
/>
|
/>
|
||||||
<button
|
<div className="absolute right-2 top-1/2 -translate-y-1/2 flex gap-1">
|
||||||
type="submit"
|
{query.trim() && (
|
||||||
disabled={loading || !query.trim()}
|
<button
|
||||||
className="absolute right-2 top-1/2 -translate-y-1/2 bg-gradient-to-r from-blue-600 to-purple-600 text-white p-3 rounded-xl hover:shadow-lg disabled:opacity-50 disabled:cursor-not-allowed transition-all"
|
type="button"
|
||||||
>
|
onClick={copyShareLink}
|
||||||
{loading ? (
|
className="bg-gray-100 text-gray-600 p-3 rounded-xl hover:bg-gray-200 transition-all"
|
||||||
<Loader2 className="w-6 h-6 animate-spin" />
|
title="Copy share link"
|
||||||
) : (
|
>
|
||||||
<Search className="w-6 h-6" />
|
{copiedLink ? (
|
||||||
|
<Check className="w-6 h-6 text-green-600" />
|
||||||
|
) : (
|
||||||
|
<Link className="w-6 h-6" />
|
||||||
|
)}
|
||||||
|
</button>
|
||||||
)}
|
)}
|
||||||
</button>
|
<button
|
||||||
|
type="submit"
|
||||||
|
disabled={loading || !query.trim()}
|
||||||
|
className="bg-gradient-to-r from-blue-600 to-purple-600 text-white p-3 rounded-xl hover:shadow-lg disabled:opacity-50 disabled:cursor-not-allowed transition-all"
|
||||||
|
>
|
||||||
|
{loading ? (
|
||||||
|
<Loader2 className="w-6 h-6 animate-spin" />
|
||||||
|
) : (
|
||||||
|
<Search className="w-6 h-6" />
|
||||||
|
)}
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</form>
|
</form>
|
||||||
|
|
||||||
@@ -315,3 +366,19 @@ export default function Home() {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Full-screen, centered spinner used as the Suspense fallback.
function LoadingFallback() {
  const spinner = <Loader2 className="w-12 h-12 text-blue-600 animate-spin" />;

  return (
    <div className="min-h-screen bg-gradient-to-br from-blue-50 via-white to-purple-50 flex items-center justify-center">{spinner}</div>
  );
}
|
||||||
|
|
||||||
|
// Page entry point: renders HasherContent (which reads the URL through
// useSearchParams) inside a Suspense boundary, showing LoadingFallback while
// the content is suspended.
export default function Home() {
  const fallback = <LoadingFallback />;

  return (
    <Suspense fallback={fallback}>
      <HasherContent />
    </Suspense>
  );
}
|
||||||
|
|
||||||
|
|||||||
@@ -225,50 +225,166 @@ async function findDuplicatesForField(
|
|||||||
return duplicates;
|
return duplicates;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function removeDuplicates(parsedArgs: ParsedArgs) {
|
/**
 * Phase 1: print the run banner, create the Elasticsearch client, check that
 * the cluster answers a health request, and read the index document count.
 *
 * @returns the connected client and the document count of INDEX_NAME
 */
async function phase1_InitAndConnect() {
  const bannerLines = [
    `🔍 Hasher Duplicate Remover - Phase 1: Initialization`,
    `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`,
    `Elasticsearch: ${ELASTICSEARCH_NODE}`,
    `Index: ${INDEX_NAME}`,
    '',
  ];
  for (const line of bannerLines) {
    console.log(line);
  }

  const client = new Client({ node: ELASTICSEARCH_NODE });

  // The health request doubles as a connectivity test; it throws on failure.
  console.log('🔗 Connecting to Elasticsearch...');
  await client.cluster.health({});
  console.log('✅ Connected successfully\n');

  const { count: totalDocuments } = await client.count({ index: INDEX_NAME });
  console.log(`📊 Total documents in index: ${totalDocuments}\n`);

  return { client, totalDocuments };
}
|
||||||
|
|
||||||
|
/**
 * Phase 2: collect duplicate groups for one field, keeping only document IDs
 * that no earlier field has already scheduled for deletion.
 *
 * Side effects: each kept group's `deleteIds` is narrowed in place, and every
 * newly scheduled ID is added to `seenDeleteIds`.
 *
 * @param client - Elasticsearch client passed through to findDuplicatesForField
 * @param field - name of the field to check
 * @param batchSize - batch size passed through to findDuplicatesForField
 * @param seenDeleteIds - IDs already scheduled for deletion by earlier calls
 * @returns the groups that still contribute new IDs, plus the number of
 *          groups found for the field before filtering
 */
async function phase2_FindDuplicatesForField(
  client: Client,
  field: string,
  batchSize: number,
  seenDeleteIds: Set<string>
): Promise<{ duplicates: DuplicateGroup[], totalFound: number }> {
  console.log(`\n🔍 Phase 2: Checking duplicates for field: ${field}...`);

  const fieldDuplicates = await findDuplicatesForField(client, field, batchSize);
  const duplicates: DuplicateGroup[] = [];
  let newDeleteCount = 0;

  for (const group of fieldDuplicates) {
    // Skip IDs another field already claimed so no document is counted twice.
    const unseenIds = group.deleteIds.filter(id => !seenDeleteIds.has(id));
    if (unseenIds.length === 0) {
      continue;
    }

    group.deleteIds = unseenIds;
    for (const id of unseenIds) {
      seenDeleteIds.add(id);
    }
    duplicates.push(group);
    newDeleteCount += unseenIds.length;
  }

  console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
  console.log(` New unique documents to delete: ${newDeleteCount}`);

  // global.gc is only defined when the runtime exposes it.
  if (global.gc) {
    global.gc();
    console.log(` ♻️ Memory freed after processing ${field}`);
  }

  return { duplicates, totalFound: fieldDuplicates.length };
}
|
||||||
|
|
||||||
|
/**
 * Phase 3: delete one batch of documents with a single bulk request.
 *
 * The batch is `deleteIds[startIndex, startIndex + batchSize)`. Failures are
 * counted and returned rather than thrown: per-item errors reported by the
 * bulk response are counted individually, and a request-level failure marks
 * the whole batch as errored.
 *
 * @param client - Elasticsearch client
 * @param deleteIds - full list of document IDs scheduled for deletion
 * @param batchSize - maximum number of IDs in this batch
 * @param startIndex - offset of the batch's first ID within `deleteIds`
 * @returns how many IDs of this batch were deleted and how many failed
 */
async function phase3_DeleteBatch(
  client: Client,
  deleteIds: string[],
  batchSize: number,
  startIndex: number
): Promise<{ deleted: number, errors: number }> {
  const batch = deleteIds.slice(startIndex, startIndex + batchSize);

  // Nothing to delete: skip the round trip rather than send an empty bulk body.
  if (batch.length === 0) {
    return { deleted: 0, errors: 0 };
  }

  let deleted = 0;
  let errors = 0;

  try {
    // A delete action has no source line, so each ID maps to exactly one operation.
    const bulkOperations = batch.map(id => ({
      delete: { _index: INDEX_NAME, _id: id }
    }));

    const bulkResponse = await client.bulk({
      operations: bulkOperations,
      // The index is refreshed once after all batches instead of per request.
      refresh: false
    });

    if (bulkResponse.errors) {
      const errorCount = bulkResponse.items.filter(item => item.delete?.error).length;
      errors += errorCount;
      deleted += batch.length - errorCount;
    } else {
      deleted += batch.length;
    }
  } catch (error) {
    console.error(`\n❌ Error deleting batch:`, error);
    errors += batch.length;
  }

  // global.gc is only defined when the runtime exposes it.
  if (global.gc) {
    global.gc();
  }

  return { deleted, errors };
}
|
||||||
|
|
||||||
|
/**
 * Phase 4: refresh the index, re-read its document count, and print the
 * final summary of the run.
 *
 * @param client - Elasticsearch client
 * @param totalDeleted - number of documents reported as deleted
 * @param totalErrors - number of deletions reported as failed
 * @param initialDocumentCount - document count taken before deletion started
 */
async function phase4_Finalize(
  client: Client,
  totalDeleted: number,
  totalErrors: number,
  initialDocumentCount: number
) {
  console.log('\n\n🔄 Phase 4: Refreshing index...');
  // Refresh first so the count below reflects the bulk deletes.
  await client.indices.refresh({ index: INDEX_NAME });

  const { count: remainingDocuments } = await client.count({ index: INDEX_NAME });

  const summaryLines = [
    '\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━',
    '✅ Duplicate removal complete!',
    `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`,
    `Documents deleted: ${totalDeleted}`,
    `Errors: ${totalErrors}`,
    `Previous document count: ${initialDocumentCount}`,
    `New document count: ${remainingDocuments}`,
    '',
  ];
  for (const line of summaryLines) {
    console.log(line);
  }
}
|
||||||
|
|
||||||
|
async function removeDuplicates(parsedArgs: ParsedArgs) {
|
||||||
const fields = parsedArgs.field
|
const fields = parsedArgs.field
|
||||||
? [parsedArgs.field]
|
? [parsedArgs.field]
|
||||||
: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
||||||
|
|
||||||
console.log(`🔍 Hasher Duplicate Remover`);
|
|
||||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
||||||
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
|
|
||||||
console.log(`Index: ${INDEX_NAME}`);
|
|
||||||
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
|
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
|
||||||
console.log(`Batch size: ${parsedArgs.batchSize}`);
|
console.log(`Batch size: ${parsedArgs.batchSize}`);
|
||||||
console.log(`Fields to check: ${fields.join(', ')}`);
|
console.log(`Fields to check: ${fields.join(', ')}`);
|
||||||
console.log('');
|
console.log('');
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Test connection
|
// === PHASE 1: Initialize ===
|
||||||
console.log('🔗 Connecting to Elasticsearch...');
|
const { client, totalDocuments } = await phase1_InitAndConnect();
|
||||||
await client.cluster.health({});
|
|
||||||
console.log('✅ Connected successfully\n');
|
// Force garbage collection after phase 1
|
||||||
|
if (global.gc) {
|
||||||
// Get index stats
|
global.gc();
|
||||||
const countResponse = await client.count({ index: INDEX_NAME });
|
console.log('♻️ Memory freed after initialization\n');
|
||||||
console.log(`📊 Total documents in index: ${countResponse.count}\n`);
|
}
|
||||||
|
|
||||||
|
// === PHASE 2: Find duplicates field by field ===
|
||||||
const allDuplicates: DuplicateGroup[] = [];
|
const allDuplicates: DuplicateGroup[] = [];
|
||||||
const seenDeleteIds = new Set<string>();
|
const seenDeleteIds = new Set<string>();
|
||||||
|
|
||||||
// Find duplicates for each field
|
|
||||||
for (const field of fields) {
|
for (const field of fields) {
|
||||||
console.log(`🔍 Checking duplicates for field: ${field}...`);
|
const { duplicates } = await phase2_FindDuplicatesForField(
|
||||||
const fieldDuplicates = await findDuplicatesForField(client, field, parsedArgs.batchSize);
|
client,
|
||||||
|
field,
|
||||||
|
parsedArgs.batchSize,
|
||||||
|
seenDeleteIds
|
||||||
|
);
|
||||||
|
allDuplicates.push(...duplicates);
|
||||||
|
|
||||||
// Filter out already seen delete IDs to avoid counting the same document multiple times
|
// Clear field duplicates to free memory
|
||||||
for (const dup of fieldDuplicates) {
|
duplicates.length = 0;
|
||||||
const newDeleteIds = dup.deleteIds.filter(id => !seenDeleteIds.has(id));
|
|
||||||
if (newDeleteIds.length > 0) {
|
|
||||||
dup.deleteIds = newDeleteIds;
|
|
||||||
newDeleteIds.forEach(id => seenDeleteIds.add(id));
|
|
||||||
allDuplicates.push(dup);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
|
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
|
||||||
@@ -310,57 +426,40 @@ async function removeDuplicates(parsedArgs: ParsedArgs) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Execute deletion
|
// === PHASE 3: Execute deletion in batches ===
|
||||||
console.log(`\n🗑️ Removing ${totalToDelete} duplicate documents...\n`);
|
console.log(`\n🗑️ Phase 3: Removing ${totalToDelete} duplicate documents...\n`);
|
||||||
|
|
||||||
let deleted = 0;
|
let totalDeleted = 0;
|
||||||
let errors = 0;
|
let totalErrors = 0;
|
||||||
const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
|
const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
|
||||||
|
|
||||||
// Delete in batches
|
// Clear allDuplicates to free memory
|
||||||
|
allDuplicates.length = 0;
|
||||||
|
|
||||||
|
// Delete in batches with memory management
|
||||||
for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
|
for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
|
||||||
const batch = deleteIds.slice(i, i + parsedArgs.batchSize);
|
const { deleted, errors } = await phase3_DeleteBatch(
|
||||||
|
client,
|
||||||
|
deleteIds,
|
||||||
|
parsedArgs.batchSize,
|
||||||
|
i
|
||||||
|
);
|
||||||
|
|
||||||
try {
|
totalDeleted += deleted;
|
||||||
const bulkOperations = batch.flatMap(id => [
|
totalErrors += errors;
|
||||||
{ delete: { _index: INDEX_NAME, _id: id } }
|
|
||||||
]);
|
|
||||||
|
|
||||||
const bulkResponse = await client.bulk({
|
process.stdout.write(
|
||||||
operations: bulkOperations,
|
`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - ` +
|
||||||
refresh: false
|
`Deleted: ${totalDeleted}, Errors: ${totalErrors}`
|
||||||
});
|
);
|
||||||
|
|
||||||
if (bulkResponse.errors) {
|
|
||||||
const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
|
|
||||||
errors += errorCount;
|
|
||||||
deleted += batch.length - errorCount;
|
|
||||||
} else {
|
|
||||||
deleted += batch.length;
|
|
||||||
}
|
|
||||||
|
|
||||||
process.stdout.write(`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - Deleted: ${deleted}, Errors: ${errors}`);
|
|
||||||
} catch (error) {
|
|
||||||
console.error(`\n❌ Error deleting batch:`, error);
|
|
||||||
errors += batch.length;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Refresh index
|
// Clear deleteIds to free memory
|
||||||
console.log('\n\n🔄 Refreshing index...');
|
deleteIds.length = 0;
|
||||||
await client.indices.refresh({ index: INDEX_NAME });
|
seenDeleteIds.clear();
|
||||||
|
|
||||||
// Get new count
|
// === PHASE 4: Finalize ===
|
||||||
const newCountResponse = await client.count({ index: INDEX_NAME });
|
await phase4_Finalize(client, totalDeleted, totalErrors, totalDocuments);
|
||||||
|
|
||||||
console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
|
|
||||||
console.log('✅ Duplicate removal complete!');
|
|
||||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
||||||
console.log(`Documents deleted: ${deleted}`);
|
|
||||||
console.log(`Errors: ${errors}`);
|
|
||||||
console.log(`Previous document count: ${countResponse.count}`);
|
|
||||||
console.log(`New document count: ${newCountResponse.count}`);
|
|
||||||
console.log('');
|
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
|
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
|
||||||
|
|||||||
Referencia en una nueva incidencia
Block a user