Comparar commits
10 Commits
bb234fef1e
...
elasticsea
| Autor | SHA1 | Fecha | |
|---|---|---|---|
|
b91d19dc0b
|
|||
|
da89037125
|
|||
|
20f0503134
|
|||
|
42bc5a15d0
|
|||
|
2de78b7461
|
|||
|
8fa586731a
|
|||
|
ad7a1cf0a7
|
|||
|
459cdcd9bc
|
|||
|
9c0c30e846
|
|||
|
179e192e82
|
1
API.md
1
API.md
@@ -179,7 +179,6 @@ The API automatically detects hash types based on length and format:
|
||||
| SHA1 | 40 | `^[a-f0-9]{40}$` |
|
||||
| SHA256 | 64 | `^[a-f0-9]{64}$` |
|
||||
| SHA512 | 128 | `^[a-f0-9]{128}$` |
|
||||
| Bcrypt | 60 | `^\$2[abxy]\$` |
|
||||
|
||||
Hashes are case-insensitive.
|
||||
|
||||
|
||||
@@ -10,12 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
### Added
|
||||
|
||||
#### Core Features
|
||||
- Hash search functionality for MD5, SHA1, SHA256, SHA512, and Bcrypt
|
||||
- Hash search functionality for MD5, SHA1, SHA256, and SHA512
|
||||
- Hash generation from plaintext input
|
||||
- Automatic detection of hash types based on length and pattern
|
||||
- Real-time hash generation with instant results
|
||||
- Copy to clipboard functionality for all hash values
|
||||
- Bcrypt verification support
|
||||
|
||||
#### Backend
|
||||
- Elasticsearch integration with configurable endpoint
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
## ✨ Key Features
|
||||
|
||||
### 🔍 Hash Search
|
||||
- Search for MD5, SHA1, SHA256, SHA512, and Bcrypt hashes
|
||||
- Search for MD5, SHA1, SHA256, and SHA512 hashes
|
||||
- Automatic hash type detection
|
||||
- Case-insensitive matching
|
||||
- Real-time results
|
||||
@@ -174,7 +174,6 @@ export ELASTICSEARCH_NODE=http://localhost:9200
|
||||
| SHA1 | 40 | `^[a-f0-9]{40}$` |
|
||||
| SHA256 | 64 | `^[a-f0-9]{64}$` |
|
||||
| SHA512 | 128 | `^[a-f0-9]{128}$` |
|
||||
| Bcrypt | 60 | `^\$2[abxy]\$` |
|
||||
|
||||
---
|
||||
|
||||
@@ -245,7 +244,6 @@ export ELASTICSEARCH_NODE=http://localhost:9200
|
||||
## 📈 Future Enhancements
|
||||
|
||||
### Planned Features
|
||||
- Bcrypt hash validation
|
||||
- Argon2 hash support
|
||||
- Search history
|
||||
- Batch lookup
|
||||
|
||||
@@ -25,7 +25,6 @@ npm run index-file -- --help # Show help
|
||||
| SHA1 | 40 | `5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8` |
|
||||
| SHA256 | 64 | `5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8` |
|
||||
| SHA512 | 128 | `b109f3bbbc244eb82441917ed06d618b9008dd09b3befd1b5e07394c706a8bb9...` |
|
||||
| Bcrypt | 60 | `$2b$10$N9qo8uLOickgx2ZMRZoMye...` |
|
||||
|
||||
## 🔌 API Quick Reference
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ A modern, high-performance hash search and generation tool powered by Elasticsea
|
||||
|
||||
## ✨ Features
|
||||
|
||||
- 🔍 **Hash Lookup**: Search for MD5, SHA1, SHA256, SHA512, and Bcrypt hashes
|
||||
- 🔍 **Hash Lookup**: Search for MD5, SHA1, SHA256, and SHA512 hashes
|
||||
- 🔑 **Hash Generation**: Generate multiple hash types from plaintext
|
||||
- 💾 **Auto-Indexing**: Automatically stores searched plaintext and hashes
|
||||
- 📊 **Elasticsearch Backend**: Scalable storage with 10 shards for performance
|
||||
@@ -274,7 +274,6 @@ npm run lint
|
||||
| SHA1 | 40 | `^[a-f0-9]{40}$` |
|
||||
| SHA256 | 64 | `^[a-f0-9]{64}$` |
|
||||
| SHA512 | 128 | `^[a-f0-9]{128}$` |
|
||||
| Bcrypt | 60 | `^\$2[abxy]\$` |
|
||||
|
||||
## 🚀 Performance
|
||||
|
||||
|
||||
@@ -8,17 +8,104 @@ interface HashDocument {
|
||||
sha1: string;
|
||||
sha256: string;
|
||||
sha512: string;
|
||||
bcrypt: string;
|
||||
created_at?: string;
|
||||
}
|
||||
|
||||
// Maximum allowed query length
|
||||
const MAX_QUERY_LENGTH = 1000;
|
||||
|
||||
// Characters that could be used in NoSQL/Elasticsearch injection attacks
|
||||
const DANGEROUS_PATTERNS = [
|
||||
/[{}\[\]]/g, // JSON structure characters
|
||||
/\$[a-zA-Z]/g, // MongoDB-style operators
|
||||
/\\u[0-9a-fA-F]{4}/g, // Unicode escapes
|
||||
/<script/gi, // XSS attempts
|
||||
/javascript:/gi, // XSS attempts
|
||||
];
|
||||
|
||||
/**
|
||||
* Sanitize input to prevent NoSQL injection attacks
|
||||
* For hash lookups, we only need alphanumeric characters and $
|
||||
* For plaintext, we allow more characters but sanitize dangerous patterns
|
||||
*/
|
||||
function sanitizeInput(input: string): string {
|
||||
// Trim and take first word only
|
||||
let sanitized = input.trim().split(/\s+/)[0] || '';
|
||||
|
||||
// Limit length
|
||||
if (sanitized.length > MAX_QUERY_LENGTH) {
|
||||
sanitized = sanitized.substring(0, MAX_QUERY_LENGTH);
|
||||
}
|
||||
|
||||
// Remove null bytes
|
||||
sanitized = sanitized.replace(/\0/g, '');
|
||||
|
||||
// Check for dangerous patterns
|
||||
for (const pattern of DANGEROUS_PATTERNS) {
|
||||
sanitized = sanitized.replace(pattern, '');
|
||||
}
|
||||
|
||||
return sanitized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate that the input is safe for use in Elasticsearch queries
|
||||
*/
|
||||
function isValidInput(input: string): boolean {
|
||||
// Check for empty input
|
||||
if (!input || input.length === 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for excessively long input
|
||||
if (input.length > MAX_QUERY_LENGTH) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for control characters (except normal whitespace)
|
||||
if (/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(input)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
export async function POST(request: NextRequest) {
|
||||
try {
|
||||
const { query } = await request.json();
|
||||
const body = await request.json();
|
||||
|
||||
// Validate request body structure
|
||||
if (!body || typeof body !== 'object') {
|
||||
return NextResponse.json(
|
||||
{ error: 'Invalid request body' },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
|
||||
const { query } = body;
|
||||
|
||||
// Validate query type
|
||||
if (!query || typeof query !== 'string') {
|
||||
return NextResponse.json(
|
||||
{ error: 'Query parameter is required' },
|
||||
{ error: 'Query parameter is required and must be a string' },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
|
||||
// Validate input before processing
|
||||
if (!isValidInput(query)) {
|
||||
return NextResponse.json(
|
||||
{ error: 'Invalid query: contains forbidden characters or is too long' },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
|
||||
// Sanitize input
|
||||
const cleanQuery = sanitizeInput(query);
|
||||
|
||||
if (!cleanQuery) {
|
||||
return NextResponse.json(
|
||||
{ error: 'Invalid query: only whitespace or invalid characters provided' },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
@@ -26,15 +113,6 @@ export async function POST(request: NextRequest) {
|
||||
// Ensure index exists
|
||||
await initializeIndex();
|
||||
|
||||
const cleanQuery = query.trim().split(/\s+/)[0];
|
||||
|
||||
if (!cleanQuery) {
|
||||
return NextResponse.json(
|
||||
{ error: 'Invalid query: only whitespace provided' },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
|
||||
const cleanQueryLower = cleanQuery.toLowerCase();
|
||||
const hashType = detectHashType(cleanQueryLower);
|
||||
|
||||
@@ -44,7 +122,7 @@ export async function POST(request: NextRequest) {
|
||||
index: INDEX_NAME,
|
||||
query: {
|
||||
term: {
|
||||
[hashType]: hashType === 'bcrypt' ? cleanQuery : cleanQueryLower
|
||||
[hashType]: cleanQueryLower
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -66,7 +144,6 @@ export async function POST(request: NextRequest) {
|
||||
sha1: source.sha1,
|
||||
sha256: source.sha256,
|
||||
sha512: source.sha512,
|
||||
bcrypt: source.bcrypt,
|
||||
}
|
||||
};
|
||||
})
|
||||
@@ -101,11 +178,10 @@ export async function POST(request: NextRequest) {
|
||||
sha1: existingDoc.sha1,
|
||||
sha256: existingDoc.sha256,
|
||||
sha512: existingDoc.sha512,
|
||||
bcrypt: existingDoc.bcrypt,
|
||||
};
|
||||
} else {
|
||||
// Plaintext not found, generate hashes and check if any hash already exists
|
||||
hashes = await generateHashes(cleanQuery);
|
||||
hashes = generateHashes(cleanQuery);
|
||||
|
||||
const hashExistsResponse = await esClient.search<HashDocument>({
|
||||
index: INDEX_NAME,
|
||||
@@ -147,7 +223,6 @@ export async function POST(request: NextRequest) {
|
||||
sha1: hashes.sha1,
|
||||
sha256: hashes.sha256,
|
||||
sha512: hashes.sha512,
|
||||
bcrypt: hashes.bcrypt,
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -14,8 +14,8 @@ const geistMono = Geist_Mono({
|
||||
|
||||
export const metadata: Metadata = {
|
||||
title: "Hasher - Hash Search & Generator",
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt. Powered by Elasticsearch.",
|
||||
keywords: ["hash", "md5", "sha1", "sha256", "sha512", "bcrypt", "hash generator", "hash search", "elasticsearch"],
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512. Powered by Elasticsearch.",
|
||||
keywords: ["hash", "md5", "sha1", "sha256", "sha512", "hash generator", "hash search", "elasticsearch"],
|
||||
authors: [{ name: "Hasher" }],
|
||||
creator: "Hasher",
|
||||
publisher: "Hasher",
|
||||
@@ -28,7 +28,7 @@ export const metadata: Metadata = {
|
||||
openGraph: {
|
||||
type: "website",
|
||||
title: "Hasher - Hash Search & Generator",
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt.",
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512.",
|
||||
siteName: "Hasher",
|
||||
images: [
|
||||
{
|
||||
@@ -42,7 +42,7 @@ export const metadata: Metadata = {
|
||||
twitter: {
|
||||
card: "summary",
|
||||
title: "Hasher - Hash Search & Generator",
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt.",
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512.",
|
||||
images: ["/logo.png"],
|
||||
},
|
||||
viewport: {
|
||||
|
||||
141
app/page.tsx
141
app/page.tsx
@@ -1,7 +1,8 @@
|
||||
'use client';
|
||||
|
||||
import { useState } from 'react';
|
||||
import { Search, Copy, Check, Hash, Key, AlertCircle, Loader2 } from 'lucide-react';
|
||||
import { useState, useEffect, useCallback, Suspense } from 'react';
|
||||
import { useSearchParams } from 'next/navigation';
|
||||
import { Search, Copy, Check, Hash, Key, AlertCircle, Loader2, Database, Link } from 'lucide-react';
|
||||
|
||||
interface SearchResult {
|
||||
found: boolean;
|
||||
@@ -15,7 +16,6 @@ interface SearchResult {
|
||||
sha1: string;
|
||||
sha256: string;
|
||||
sha512: string;
|
||||
bcrypt: string;
|
||||
};
|
||||
results?: Array<{
|
||||
plaintext: string;
|
||||
@@ -24,22 +24,41 @@ interface SearchResult {
|
||||
sha1: string;
|
||||
sha256: string;
|
||||
sha512: string;
|
||||
bcrypt: string;
|
||||
};
|
||||
}>;
|
||||
message?: string;
|
||||
}
|
||||
|
||||
export default function Home() {
|
||||
interface IndexStats {
|
||||
documentCount: number;
|
||||
indexSize: number;
|
||||
}
|
||||
|
||||
function formatBytes(bytes: number): string {
|
||||
if (bytes === 0) return '0 B';
|
||||
const k = 1024;
|
||||
const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
|
||||
const i = Math.floor(Math.log(bytes) / Math.log(k));
|
||||
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
|
||||
}
|
||||
|
||||
function formatNumber(num: number): string {
|
||||
return num.toLocaleString();
|
||||
}
|
||||
|
||||
function HasherContent() {
|
||||
const searchParams = useSearchParams();
|
||||
const [query, setQuery] = useState('');
|
||||
const [result, setResult] = useState<SearchResult | null>(null);
|
||||
const [loading, setLoading] = useState(false);
|
||||
const [error, setError] = useState('');
|
||||
const [copiedField, setCopiedField] = useState<string | null>(null);
|
||||
const [stats, setStats] = useState<IndexStats | null>(null);
|
||||
const [copiedLink, setCopiedLink] = useState(false);
|
||||
const [initialLoadDone, setInitialLoadDone] = useState(false);
|
||||
|
||||
const handleSearch = async (e: React.FormEvent) => {
|
||||
e.preventDefault();
|
||||
if (!query.trim()) return;
|
||||
const performSearch = useCallback(async (searchQuery: string, updateUrl: boolean = true) => {
|
||||
if (!searchQuery.trim()) return;
|
||||
|
||||
setLoading(true);
|
||||
setError('');
|
||||
@@ -49,7 +68,7 @@ export default function Home() {
|
||||
const response = await fetch('/api/search', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ query: query.trim() })
|
||||
body: JSON.stringify({ query: searchQuery.trim() })
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
@@ -58,11 +77,53 @@ export default function Home() {
|
||||
|
||||
const data = await response.json();
|
||||
setResult(data);
|
||||
|
||||
// Update URL with search query (using history API to avoid re-triggering effects)
|
||||
if (updateUrl) {
|
||||
const newUrl = new URL(window.location.href);
|
||||
newUrl.searchParams.set('q', searchQuery.trim());
|
||||
window.history.replaceState(null, '', newUrl.pathname + newUrl.search);
|
||||
}
|
||||
} catch (_err) {
|
||||
setError('Failed to perform search. Please check your connection.');
|
||||
} finally {
|
||||
setLoading(false);
|
||||
}
|
||||
}, []);
|
||||
|
||||
// Load query from URL on mount (only once)
|
||||
useEffect(() => {
|
||||
if (initialLoadDone) return;
|
||||
|
||||
const urlQuery = searchParams.get('q');
|
||||
if (urlQuery) {
|
||||
setQuery(urlQuery);
|
||||
performSearch(urlQuery, false);
|
||||
}
|
||||
setInitialLoadDone(true);
|
||||
}, [searchParams, performSearch, initialLoadDone]);
|
||||
|
||||
useEffect(() => {
|
||||
const fetchStats = async () => {
|
||||
try {
|
||||
const response = await fetch('/api/health');
|
||||
if (response.ok) {
|
||||
const data = await response.json();
|
||||
if (data.index?.stats) {
|
||||
setStats(data.index.stats);
|
||||
}
|
||||
}
|
||||
} catch (_err) {
|
||||
// Silently fail - stats are not critical
|
||||
}
|
||||
};
|
||||
|
||||
fetchStats();
|
||||
}, [result]); // Refresh stats after each search result
|
||||
|
||||
const handleSearch = async (e: React.FormEvent) => {
|
||||
e.preventDefault();
|
||||
performSearch(query);
|
||||
};
|
||||
|
||||
const copyToClipboard = (text: string, field: string) => {
|
||||
@@ -71,6 +132,14 @@ export default function Home() {
|
||||
setTimeout(() => setCopiedField(null), 2000);
|
||||
};
|
||||
|
||||
const copyShareLink = () => {
|
||||
const url = new URL(window.location.href);
|
||||
url.searchParams.set('q', query.trim());
|
||||
navigator.clipboard.writeText(url.toString());
|
||||
setCopiedLink(true);
|
||||
setTimeout(() => setCopiedLink(false), 2000);
|
||||
};
|
||||
|
||||
const HashDisplay = ({ label, value, field }: { label: string; value: string; field: string }) => (
|
||||
<div className="bg-gray-50 rounded-lg p-4 border border-gray-200">
|
||||
<div className="flex items-center justify-between mb-2">
|
||||
@@ -108,8 +177,20 @@ export default function Home() {
|
||||
Search for hashes or generate them from plaintext
|
||||
</p>
|
||||
<p className="text-sm text-gray-500 mt-2">
|
||||
Supports MD5, SHA1, SHA256, SHA512, and Bcrypt
|
||||
Supports MD5, SHA1, SHA256, and SHA512
|
||||
</p>
|
||||
{stats && (
|
||||
<div className="flex items-center justify-center gap-4 mt-4 text-sm text-gray-500">
|
||||
<div className="flex items-center gap-1.5">
|
||||
<Database className="w-4 h-4" />
|
||||
<span><strong>{formatNumber(stats.documentCount)}</strong> hashes</span>
|
||||
</div>
|
||||
<span className="text-gray-300">•</span>
|
||||
<div>
|
||||
<span><strong>{formatBytes(stats.indexSize)}</strong> indexed</span>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Search Form */}
|
||||
@@ -120,12 +201,27 @@ export default function Home() {
|
||||
value={query}
|
||||
onChange={(e) => setQuery(e.target.value)}
|
||||
placeholder="Enter a hash or plaintext..."
|
||||
className="w-full px-6 py-4 pr-14 text-lg rounded-2xl border-2 border-gray-200 focus:border-blue-500 focus:ring-4 focus:ring-blue-100 outline-none transition-all shadow-sm"
|
||||
className="w-full px-6 py-4 pr-28 text-lg rounded-2xl border-2 border-gray-200 focus:border-blue-500 focus:ring-4 focus:ring-blue-100 outline-none transition-all shadow-sm"
|
||||
/>
|
||||
<div className="absolute right-2 top-1/2 -translate-y-1/2 flex gap-1">
|
||||
{query.trim() && (
|
||||
<button
|
||||
type="button"
|
||||
onClick={copyShareLink}
|
||||
className="bg-gray-100 text-gray-600 p-3 rounded-xl hover:bg-gray-200 transition-all"
|
||||
title="Copy share link"
|
||||
>
|
||||
{copiedLink ? (
|
||||
<Check className="w-6 h-6 text-green-600" />
|
||||
) : (
|
||||
<Link className="w-6 h-6" />
|
||||
)}
|
||||
</button>
|
||||
)}
|
||||
<button
|
||||
type="submit"
|
||||
disabled={loading || !query.trim()}
|
||||
className="absolute right-2 top-1/2 -translate-y-1/2 bg-gradient-to-r from-blue-600 to-purple-600 text-white p-3 rounded-xl hover:shadow-lg disabled:opacity-50 disabled:cursor-not-allowed transition-all"
|
||||
className="bg-gradient-to-r from-blue-600 to-purple-600 text-white p-3 rounded-xl hover:shadow-lg disabled:opacity-50 disabled:cursor-not-allowed transition-all"
|
||||
>
|
||||
{loading ? (
|
||||
<Loader2 className="w-6 h-6 animate-spin" />
|
||||
@@ -134,6 +230,7 @@ export default function Home() {
|
||||
)}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
{/* Error Message */}
|
||||
@@ -166,7 +263,6 @@ export default function Home() {
|
||||
<HashDisplay label="SHA1" value={result.hashes!.sha1} field="sha1-gen" />
|
||||
<HashDisplay label="SHA256" value={result.hashes!.sha256} field="sha256-gen" />
|
||||
<HashDisplay label="SHA512" value={result.hashes!.sha512} field="sha512-gen" />
|
||||
<HashDisplay label="Bcrypt" value={result.hashes!.bcrypt} field="bcrypt-gen" />
|
||||
</div>
|
||||
{result.wasGenerated && (
|
||||
<div className="mt-6 bg-blue-50 border border-blue-200 rounded-xl p-4">
|
||||
@@ -212,7 +308,6 @@ export default function Home() {
|
||||
<HashDisplay label="SHA1" value={item.hashes.sha1} field={`sha1-${idx}`} />
|
||||
<HashDisplay label="SHA256" value={item.hashes.sha256} field={`sha256-${idx}`} />
|
||||
<HashDisplay label="SHA512" value={item.hashes.sha512} field={`sha512-${idx}`} />
|
||||
<HashDisplay label="Bcrypt" value={item.hashes.bcrypt} field={`bcrypt-${idx}`} />
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
@@ -256,7 +351,7 @@ export default function Home() {
|
||||
</div>
|
||||
<h3 className="text-xl font-bold text-gray-900 mb-2">Generate Hashes</h3>
|
||||
<p className="text-gray-600">
|
||||
Enter any plaintext to instantly generate MD5, SHA1, SHA256, SHA512, and Bcrypt hashes. Results are saved automatically.
|
||||
Enter any plaintext to instantly generate MD5, SHA1, SHA256, and SHA512 hashes. Results are saved automatically.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
@@ -271,3 +366,19 @@ export default function Home() {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Full-screen centered spinner rendered while the main search UI is
 * suspended (used as the <Suspense> fallback in Home below).
 */
function LoadingFallback() {
  return (
    <div className="min-h-screen bg-gradient-to-br from-blue-50 via-white to-purple-50 flex items-center justify-center">
      <Loader2 className="w-12 h-12 text-blue-600 animate-spin" />
    </div>
  );
}
|
||||
|
||||
/**
 * Page entry point. HasherContent reads the URL query via
 * useSearchParams, so it is wrapped in a Suspense boundary here.
 * NOTE(review): presumably required because Next.js mandates a
 * <Suspense> boundary around useSearchParams consumers — confirm
 * against the Next.js app-router documentation.
 */
export default function Home() {
  return (
    <Suspense fallback={<LoadingFallback />}>
      <HasherContent />
    </Suspense>
  );
}
|
||||
|
||||
|
||||
@@ -46,9 +46,6 @@ export const INDEX_MAPPING = {
|
||||
sha512: {
|
||||
type: 'keyword' as const
|
||||
},
|
||||
bcrypt: {
|
||||
type: 'keyword' as const
|
||||
},
|
||||
created_at: {
|
||||
type: 'date' as const
|
||||
}
|
||||
|
||||
23
lib/hash.ts
23
lib/hash.ts
@@ -1,5 +1,4 @@
|
||||
import crypto from 'crypto';
|
||||
import bcrypt from 'bcrypt';
|
||||
|
||||
export interface HashResult {
|
||||
plaintext: string;
|
||||
@@ -7,22 +6,18 @@ export interface HashResult {
|
||||
sha1: string;
|
||||
sha256: string;
|
||||
sha512: string;
|
||||
bcrypt: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate all common hashes for a given plaintext
|
||||
*/
|
||||
export async function generateHashes(plaintext: string): Promise<HashResult> {
|
||||
const bcryptHash = await bcrypt.hash(plaintext, 10);
|
||||
|
||||
export function generateHashes(plaintext: string): HashResult {
|
||||
return {
|
||||
plaintext,
|
||||
md5: crypto.createHash('md5').update(plaintext).digest('hex'),
|
||||
sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
|
||||
sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
|
||||
sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
|
||||
bcrypt: bcryptHash,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -52,11 +47,6 @@ export function detectHashType(hash: string): string | null {
|
||||
return 'sha512';
|
||||
}
|
||||
|
||||
// BCrypt: starts with $2a$, $2b$, $2x$, or $2y$
|
||||
if (/^\$2[abxy]\$/.test(cleanHash)) {
|
||||
return 'bcrypt';
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -66,14 +56,3 @@ export function detectHashType(hash: string): string | null {
|
||||
export function isHash(input: string): boolean {
|
||||
return detectHashType(input) !== null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify a plaintext against a bcrypt hash
|
||||
*/
|
||||
export async function verifyBcrypt(plaintext: string, hash: string): Promise<boolean> {
|
||||
try {
|
||||
return await bcrypt.compare(plaintext, hash);
|
||||
} catch (_error) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,12 +34,11 @@
|
||||
"build": "next build",
|
||||
"start": "next start",
|
||||
"lint": "eslint",
|
||||
"index-file": "tsx scripts/index-file.ts"
|
||||
"index-file": "tsx scripts/index-file.ts",
|
||||
"remove-duplicates": "tsx scripts/remove-duplicates.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@elastic/elasticsearch": "^9.2.0",
|
||||
"@types/bcrypt": "^6.0.0",
|
||||
"bcrypt": "^6.0.0",
|
||||
"lucide-react": "^0.555.0",
|
||||
"next": "15.4.8",
|
||||
"react": "19.1.2",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "Hasher - Hash Search & Generator",
|
||||
"short_name": "Hasher",
|
||||
"description": "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt.",
|
||||
"description": "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512.",
|
||||
"start_url": "/",
|
||||
"display": "standalone",
|
||||
"background_color": "#ffffff",
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
* --batch-size=<number> Number of items to process in each batch (default: 100)
|
||||
* --resume Resume from last saved state (default: true)
|
||||
* --no-resume Start from beginning, ignore saved state
|
||||
* --no-check Skip duplicate checking (faster, but may create duplicates)
|
||||
* --state-file=<path> Custom state file path (default: .indexer-state-<filename>.json)
|
||||
* --help, -h Show this help message
|
||||
*/
|
||||
@@ -34,7 +35,6 @@ interface HashDocument {
|
||||
sha1: string;
|
||||
sha256: string;
|
||||
sha512: string;
|
||||
bcrypt: string;
|
||||
created_at: string;
|
||||
}
|
||||
|
||||
@@ -54,6 +54,7 @@ interface ParsedArgs {
|
||||
filePath: string | null;
|
||||
batchSize: number;
|
||||
resume: boolean;
|
||||
checkDuplicates: boolean;
|
||||
stateFile: string | null;
|
||||
showHelp: boolean;
|
||||
}
|
||||
@@ -63,6 +64,7 @@ function parseArgs(args: string[]): ParsedArgs {
|
||||
filePath: null,
|
||||
batchSize: DEFAULT_BATCH_SIZE,
|
||||
resume: true,
|
||||
checkDuplicates: true,
|
||||
stateFile: null,
|
||||
showHelp: false
|
||||
};
|
||||
@@ -76,6 +78,8 @@ function parseArgs(args: string[]): ParsedArgs {
|
||||
result.resume = true;
|
||||
} else if (arg === '--no-resume') {
|
||||
result.resume = false;
|
||||
} else if (arg === '--no-check') {
|
||||
result.checkDuplicates = false;
|
||||
} else if (arg.startsWith('--batch-size=')) {
|
||||
const value = arg.split('=')[1];
|
||||
const parsed = parseInt(value, 10);
|
||||
@@ -152,17 +156,13 @@ function deleteState(stateFile: string): void {
|
||||
}
|
||||
}
|
||||
|
||||
async function generateHashes(plaintext: string): Promise<HashDocument> {
|
||||
const bcrypt = await import('bcrypt');
|
||||
const bcryptHash = await bcrypt.default.hash(plaintext, 10);
|
||||
|
||||
function generateHashes(plaintext: string): HashDocument {
|
||||
return {
|
||||
plaintext,
|
||||
md5: crypto.createHash('md5').update(plaintext).digest('hex'),
|
||||
sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
|
||||
sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
|
||||
sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
|
||||
bcrypt: bcryptHash,
|
||||
created_at: new Date().toISOString()
|
||||
};
|
||||
}
|
||||
@@ -180,6 +180,7 @@ Options:
|
||||
--batch-size <number> Alternative syntax for batch size
|
||||
--resume Resume from last saved state (default)
|
||||
--no-resume Start from beginning, ignore saved state
|
||||
--no-check Skip duplicate checking (faster, but may create duplicates)
|
||||
--state-file=<path> Custom state file path
|
||||
--help, -h Show this help message
|
||||
|
||||
@@ -191,17 +192,23 @@ Examples:
|
||||
npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
|
||||
npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
|
||||
npx tsx scripts/index-file.ts wordlist.txt --no-resume
|
||||
npm run index-file -- wordlist.txt --batch-size=500
|
||||
npx tsx scripts/index-file.ts wordlist.txt --no-check
|
||||
npm run index-file -- wordlist.txt --batch-size=500 --no-check
|
||||
|
||||
State Management:
|
||||
The script automatically saves progress to a state file. If interrupted,
|
||||
it will resume from where it left off on the next run. Use --no-resume
|
||||
to start fresh.
|
||||
|
||||
Duplicate Checking:
|
||||
By default, the script checks if each plaintext or hash already exists
|
||||
in the index before inserting. Use --no-check to skip this verification
|
||||
for faster indexing (useful when you're sure there are no duplicates).
|
||||
`);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, customStateFile: string | null) {
|
||||
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
|
||||
const client = new Client({ node: ELASTICSEARCH_NODE });
|
||||
const absolutePath = resolve(filePath);
|
||||
const stateFile = customStateFile || getDefaultStateFile(absolutePath);
|
||||
@@ -244,6 +251,7 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
||||
console.log(`Index: ${INDEX_NAME}`);
|
||||
console.log(`File: ${filePath}`);
|
||||
console.log(`Batch size: ${batchSize}`);
|
||||
console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
|
||||
console.log(`State file: ${stateFile}`);
|
||||
if (resumingFrom > 0) {
|
||||
console.log(`Resuming from: line ${resumingFrom}`);
|
||||
@@ -300,13 +308,12 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
||||
const bulkOperations: any[] = [];
|
||||
|
||||
// Generate hashes for all items in batch first
|
||||
const batchWithHashes = await Promise.all(
|
||||
batch.map(async (plaintext: string) => ({
|
||||
const batchWithHashes = batch.map((plaintext: string) => ({
|
||||
plaintext,
|
||||
hashes: await generateHashes(plaintext)
|
||||
}))
|
||||
);
|
||||
hashes: generateHashes(plaintext)
|
||||
}));
|
||||
|
||||
if (checkDuplicates) {
|
||||
// Check which items already exist (by plaintext or any hash)
|
||||
const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
|
||||
const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
|
||||
@@ -343,7 +350,6 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
||||
});
|
||||
|
||||
// Prepare bulk operations only for items that don't have any duplicate hash
|
||||
let batchSkipped = 0;
|
||||
for (const item of batchWithHashes) {
|
||||
const isDuplicate =
|
||||
existingHashes.has(item.plaintext) ||
|
||||
@@ -356,11 +362,17 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
||||
bulkOperations.push({ index: { _index: INDEX_NAME } });
|
||||
bulkOperations.push(item.hashes);
|
||||
} else {
|
||||
batchSkipped++;
|
||||
state.skipped++;
|
||||
sessionSkipped++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No duplicate checking - index everything
|
||||
for (const item of batchWithHashes) {
|
||||
bulkOperations.push({ index: { _index: INDEX_NAME } });
|
||||
bulkOperations.push(item.hashes);
|
||||
}
|
||||
}
|
||||
|
||||
// Execute bulk operation only if there are new items to insert
|
||||
if (bulkOperations.length > 0) {
|
||||
@@ -498,9 +510,10 @@ console.log(`\n🔧 Configuration:`);
|
||||
console.log(` File: ${filePath}`);
|
||||
console.log(` Batch size: ${parsedArgs.batchSize}`);
|
||||
console.log(` Resume: ${parsedArgs.resume}`);
|
||||
console.log(` Check duplicates: ${parsedArgs.checkDuplicates}`);
|
||||
if (parsedArgs.stateFile) {
|
||||
console.log(` State file: ${parsedArgs.stateFile}`);
|
||||
}
|
||||
console.log('');
|
||||
|
||||
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.stateFile).catch(console.error);
|
||||
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);
|
||||
|
||||
496
scripts/remove-duplicates.ts
Archivo normal
496
scripts/remove-duplicates.ts
Archivo normal
@@ -0,0 +1,496 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* Hasher Duplicate Remover Script
|
||||
*
|
||||
* This script finds and removes duplicate entries from the Elasticsearch index.
|
||||
* It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/remove-duplicates.ts [options]
|
||||
* npm run remove-duplicates [-- options]
|
||||
*
|
||||
* Options:
|
||||
* --dry-run Show duplicates without removing them (default)
|
||||
* --execute Actually remove the duplicates
|
||||
* --batch-size=<number> Number of items to process in each batch (default: 1000)
|
||||
* --field=<field> Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
|
||||
* --help, -h Show this help message
|
||||
*/
|
||||
|
||||
import { Client } from '@elastic/elasticsearch';
|
||||
|
||||
// Elasticsearch endpoint; override with the ELASTICSEARCH_NODE env var.
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
// Index holding the plaintext/hash documents this script deduplicates.
const INDEX_NAME = 'hasher';
|
||||
const DEFAULT_BATCH_SIZE = 1000;
|
||||
|
||||
interface ParsedArgs {
|
||||
dryRun: boolean;
|
||||
batchSize: number;
|
||||
field: string | null;
|
||||
showHelp: boolean;
|
||||
}
|
||||
|
||||
interface DuplicateGroup {
|
||||
value: string;
|
||||
field: string;
|
||||
documentIds: string[];
|
||||
keepId: string;
|
||||
deleteIds: string[];
|
||||
}
|
||||
|
||||
function parseArgs(args: string[]): ParsedArgs {
|
||||
const result: ParsedArgs = {
|
||||
dryRun: true,
|
||||
batchSize: DEFAULT_BATCH_SIZE,
|
||||
field: null,
|
||||
showHelp: false
|
||||
};
|
||||
|
||||
for (let i = 0; i < args.length; i++) {
|
||||
const arg = args[i];
|
||||
|
||||
if (arg === '--help' || arg === '-h') {
|
||||
result.showHelp = true;
|
||||
} else if (arg === '--dry-run') {
|
||||
result.dryRun = true;
|
||||
} else if (arg === '--execute') {
|
||||
result.dryRun = false;
|
||||
} else if (arg.startsWith('--batch-size=')) {
|
||||
const value = arg.split('=')[1];
|
||||
const parsed = parseInt(value, 10);
|
||||
if (!isNaN(parsed) && parsed > 0) {
|
||||
result.batchSize = parsed;
|
||||
}
|
||||
} else if (arg === '--batch-size') {
|
||||
const nextArg = args[i + 1];
|
||||
if (nextArg && !nextArg.startsWith('-')) {
|
||||
const parsed = parseInt(nextArg, 10);
|
||||
if (!isNaN(parsed) && parsed > 0) {
|
||||
result.batchSize = parsed;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
} else if (arg.startsWith('--field=')) {
|
||||
result.field = arg.split('=')[1];
|
||||
} else if (arg === '--field') {
|
||||
const nextArg = args[i + 1];
|
||||
if (nextArg && !nextArg.startsWith('-')) {
|
||||
result.field = nextArg;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Prints the usage text to stdout and terminates the process with exit
 * code 0. Invoked when --help / -h is present on the command line.
 */
function showHelp() {
  console.log(`
Hasher Duplicate Remover Script

Usage:
  npx tsx scripts/remove-duplicates.ts [options]
  npm run remove-duplicates [-- options]

Options:
  --dry-run              Show duplicates without removing them (default)
  --execute              Actually remove the duplicates
  --batch-size=<number>  Number of items to process in each batch (default: 1000)
  --field=<field>        Check duplicates only on this field
                         Valid fields: plaintext, md5, sha1, sha256, sha512
  --help, -h             Show this help message

Environment Variables:
  ELASTICSEARCH_NODE     Elasticsearch node URL (default: http://localhost:9200)

Examples:
  npx tsx scripts/remove-duplicates.ts                    # Dry run, show all duplicates
  npx tsx scripts/remove-duplicates.ts --execute          # Remove all duplicates
  npx tsx scripts/remove-duplicates.ts --field=md5        # Check only md5 duplicates
  npx tsx scripts/remove-duplicates.ts --execute --field=plaintext

Notes:
  - The script keeps the OLDEST document (by created_at) and removes newer duplicates
  - Always run with --dry-run first to review what will be deleted
  - Duplicates are checked across all hash fields by default
`);
  // Help is a terminal action: never fall through into the main workflow.
  process.exit(0);
}
|
||||
|
||||
async function findDuplicatesForField(
|
||||
client: Client,
|
||||
field: string,
|
||||
batchSize: number
|
||||
): Promise<DuplicateGroup[]> {
|
||||
const duplicates: DuplicateGroup[] = [];
|
||||
|
||||
// Use aggregation to find duplicate values
|
||||
const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
|
||||
|
||||
// Use composite aggregation to handle large number of duplicates
|
||||
let afterKey: any = undefined;
|
||||
let hasMore = true;
|
||||
|
||||
console.log(` Scanning for duplicates...`);
|
||||
|
||||
while (hasMore) {
|
||||
const aggQuery: any = {
|
||||
index: INDEX_NAME,
|
||||
size: 0,
|
||||
aggs: {
|
||||
duplicates: {
|
||||
composite: {
|
||||
size: batchSize,
|
||||
sources: [
|
||||
{ value: { terms: { field: fieldToAggregate } } }
|
||||
],
|
||||
...(afterKey && { after: afterKey })
|
||||
},
|
||||
aggs: {
|
||||
doc_count_filter: {
|
||||
bucket_selector: {
|
||||
buckets_path: { count: '_count' },
|
||||
script: 'params.count > 1'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const response = await client.search(aggQuery);
|
||||
const compositeAgg = response.aggregations?.duplicates as any;
|
||||
const buckets = compositeAgg?.buckets || [];
|
||||
|
||||
for (const bucket of buckets) {
|
||||
if (bucket.doc_count > 1) {
|
||||
const value = bucket.key.value;
|
||||
|
||||
// Use scroll API for large result sets
|
||||
const documentIds: string[] = [];
|
||||
|
||||
let scrollResponse = await client.search({
|
||||
index: INDEX_NAME,
|
||||
scroll: '1m',
|
||||
size: 1000,
|
||||
query: {
|
||||
term: {
|
||||
[fieldToAggregate]: value
|
||||
}
|
||||
},
|
||||
sort: [
|
||||
{ created_at: { order: 'asc' } }
|
||||
],
|
||||
_source: false
|
||||
});
|
||||
|
||||
while (scrollResponse.hits.hits.length > 0) {
|
||||
documentIds.push(...scrollResponse.hits.hits.map((hit: any) => hit._id));
|
||||
|
||||
if (!scrollResponse._scroll_id) break;
|
||||
|
||||
scrollResponse = await client.scroll({
|
||||
scroll_id: scrollResponse._scroll_id,
|
||||
scroll: '1m'
|
||||
});
|
||||
}
|
||||
|
||||
// Clear scroll
|
||||
if (scrollResponse._scroll_id) {
|
||||
await client.clearScroll({ scroll_id: scrollResponse._scroll_id }).catch(() => {});
|
||||
}
|
||||
|
||||
if (documentIds.length > 1) {
|
||||
duplicates.push({
|
||||
value: String(value),
|
||||
field,
|
||||
documentIds,
|
||||
keepId: documentIds[0], // Keep the oldest
|
||||
deleteIds: documentIds.slice(1) // Delete the rest
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check if there are more results
|
||||
afterKey = compositeAgg?.after_key;
|
||||
hasMore = buckets.length === batchSize && afterKey;
|
||||
|
||||
if (hasMore) {
|
||||
process.stdout.write(`\r Found ${duplicates.length} duplicate groups so far...`);
|
||||
}
|
||||
}
|
||||
|
||||
return duplicates;
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 1: Initialize and connect to Elasticsearch
|
||||
*/
|
||||
async function phase1_InitAndConnect() {
|
||||
console.log(`🔍 Hasher Duplicate Remover - Phase 1: Initialization`);
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
|
||||
console.log(`Index: ${INDEX_NAME}`);
|
||||
console.log('');
|
||||
|
||||
const client = new Client({ node: ELASTICSEARCH_NODE });
|
||||
|
||||
console.log('🔗 Connecting to Elasticsearch...');
|
||||
await client.cluster.health({});
|
||||
console.log('✅ Connected successfully\n');
|
||||
|
||||
const countResponse = await client.count({ index: INDEX_NAME });
|
||||
console.log(`📊 Total documents in index: ${countResponse.count}\n`);
|
||||
|
||||
return { client, totalDocuments: countResponse.count };
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 2: Find duplicates for a specific field
|
||||
*/
|
||||
async function phase2_FindDuplicatesForField(
|
||||
client: Client,
|
||||
field: string,
|
||||
batchSize: number,
|
||||
seenDeleteIds: Set<string>
|
||||
): Promise<{ duplicates: DuplicateGroup[], totalFound: number }> {
|
||||
console.log(`\n🔍 Phase 2: Checking duplicates for field: ${field}...`);
|
||||
|
||||
const fieldDuplicates = await findDuplicatesForField(client, field, batchSize);
|
||||
const duplicates: DuplicateGroup[] = [];
|
||||
|
||||
// Filter out already seen delete IDs to avoid counting the same document multiple times
|
||||
for (const dup of fieldDuplicates) {
|
||||
const newDeleteIds = dup.deleteIds.filter(id => !seenDeleteIds.has(id));
|
||||
if (newDeleteIds.length > 0) {
|
||||
dup.deleteIds = newDeleteIds;
|
||||
newDeleteIds.forEach(id => seenDeleteIds.add(id));
|
||||
duplicates.push(dup);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
|
||||
console.log(` New unique documents to delete: ${duplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0)}`);
|
||||
|
||||
// Force garbage collection if available
|
||||
if (global.gc) {
|
||||
global.gc();
|
||||
console.log(` ♻️ Memory freed after processing ${field}`);
|
||||
}
|
||||
|
||||
return { duplicates, totalFound: fieldDuplicates.length };
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 3: Process deletion for a batch of duplicates
|
||||
*/
|
||||
async function phase3_DeleteBatch(
|
||||
client: Client,
|
||||
deleteIds: string[],
|
||||
batchSize: number,
|
||||
startIndex: number
|
||||
): Promise<{ deleted: number, errors: number }> {
|
||||
const batch = deleteIds.slice(startIndex, startIndex + batchSize);
|
||||
let deleted = 0;
|
||||
let errors = 0;
|
||||
|
||||
try {
|
||||
const bulkOperations = batch.flatMap(id => [
|
||||
{ delete: { _index: INDEX_NAME, _id: id } }
|
||||
]);
|
||||
|
||||
const bulkResponse = await client.bulk({
|
||||
operations: bulkOperations,
|
||||
refresh: false
|
||||
});
|
||||
|
||||
if (bulkResponse.errors) {
|
||||
const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
|
||||
errors += errorCount;
|
||||
deleted += batch.length - errorCount;
|
||||
} else {
|
||||
deleted += batch.length;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`\n❌ Error deleting batch:`, error);
|
||||
errors += batch.length;
|
||||
}
|
||||
|
||||
// Force garbage collection if available
|
||||
if (global.gc) {
|
||||
global.gc();
|
||||
}
|
||||
|
||||
return { deleted, errors };
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 4: Finalize and report results
|
||||
*/
|
||||
async function phase4_Finalize(
|
||||
client: Client,
|
||||
totalDeleted: number,
|
||||
totalErrors: number,
|
||||
initialDocumentCount: number
|
||||
) {
|
||||
console.log('\n\n🔄 Phase 4: Refreshing index...');
|
||||
await client.indices.refresh({ index: INDEX_NAME });
|
||||
|
||||
const newCountResponse = await client.count({ index: INDEX_NAME });
|
||||
|
||||
console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
|
||||
console.log('✅ Duplicate removal complete!');
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`Documents deleted: ${totalDeleted}`);
|
||||
console.log(`Errors: ${totalErrors}`);
|
||||
console.log(`Previous document count: ${initialDocumentCount}`);
|
||||
console.log(`New document count: ${newCountResponse.count}`);
|
||||
console.log('');
|
||||
}
|
||||
|
||||
async function removeDuplicates(parsedArgs: ParsedArgs) {
|
||||
const fields = parsedArgs.field
|
||||
? [parsedArgs.field]
|
||||
: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
||||
|
||||
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
|
||||
console.log(`Batch size: ${parsedArgs.batchSize}`);
|
||||
console.log(`Fields to check: ${fields.join(', ')}`);
|
||||
console.log('');
|
||||
|
||||
try {
|
||||
// === PHASE 1: Initialize ===
|
||||
const { client, totalDocuments } = await phase1_InitAndConnect();
|
||||
|
||||
// Force garbage collection after phase 1
|
||||
if (global.gc) {
|
||||
global.gc();
|
||||
console.log('♻️ Memory freed after initialization\n');
|
||||
}
|
||||
|
||||
// === PHASE 2: Find duplicates field by field ===
|
||||
const allDuplicates: DuplicateGroup[] = [];
|
||||
const seenDeleteIds = new Set<string>();
|
||||
|
||||
for (const field of fields) {
|
||||
const { duplicates } = await phase2_FindDuplicatesForField(
|
||||
client,
|
||||
field,
|
||||
parsedArgs.batchSize,
|
||||
seenDeleteIds
|
||||
);
|
||||
allDuplicates.push(...duplicates);
|
||||
|
||||
// Clear field duplicates to free memory
|
||||
duplicates.length = 0;
|
||||
}
|
||||
|
||||
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
|
||||
|
||||
console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`📋 Summary:`);
|
||||
console.log(` Duplicate groups found: ${allDuplicates.length}`);
|
||||
console.log(` Documents to delete: ${totalToDelete}`);
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
||||
|
||||
if (allDuplicates.length === 0) {
|
||||
console.log('✨ No duplicates found! Index is clean.\n');
|
||||
return;
|
||||
}
|
||||
|
||||
// Show sample of duplicates
|
||||
console.log(`📝 Sample duplicates (showing first 10):\n`);
|
||||
const samplesToShow = allDuplicates.slice(0, 10);
|
||||
for (const dup of samplesToShow) {
|
||||
const truncatedValue = dup.value.length > 50
|
||||
? dup.value.substring(0, 50) + '...'
|
||||
: dup.value;
|
||||
console.log(` Field: ${dup.field}`);
|
||||
console.log(` Value: ${truncatedValue}`);
|
||||
console.log(` Keep: ${dup.keepId}`);
|
||||
console.log(` Delete: ${dup.deleteIds.length} document(s)`);
|
||||
console.log('');
|
||||
}
|
||||
|
||||
if (allDuplicates.length > 10) {
|
||||
console.log(` ... and ${allDuplicates.length - 10} more duplicate groups\n`);
|
||||
}
|
||||
|
||||
if (parsedArgs.dryRun) {
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`🔎 DRY RUN - No changes made`);
|
||||
console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
||||
return;
|
||||
}
|
||||
|
||||
// === PHASE 3: Execute deletion in batches ===
|
||||
console.log(`\n🗑️ Phase 3: Removing ${totalToDelete} duplicate documents...\n`);
|
||||
|
||||
let totalDeleted = 0;
|
||||
let totalErrors = 0;
|
||||
const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
|
||||
|
||||
// Clear allDuplicates to free memory
|
||||
allDuplicates.length = 0;
|
||||
|
||||
// Delete in batches with memory management
|
||||
for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
|
||||
const { deleted, errors } = await phase3_DeleteBatch(
|
||||
client,
|
||||
deleteIds,
|
||||
parsedArgs.batchSize,
|
||||
i
|
||||
);
|
||||
|
||||
totalDeleted += deleted;
|
||||
totalErrors += errors;
|
||||
|
||||
process.stdout.write(
|
||||
`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - ` +
|
||||
`Deleted: ${totalDeleted}, Errors: ${totalErrors}`
|
||||
);
|
||||
}
|
||||
|
||||
// Clear deleteIds to free memory
|
||||
deleteIds.length = 0;
|
||||
seenDeleteIds.clear();
|
||||
|
||||
// === PHASE 4: Finalize ===
|
||||
await phase4_Finalize(client, totalDeleted, totalErrors, totalDocuments);
|
||||
|
||||
} catch (error) {
|
||||
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Parse command line arguments
|
||||
const args = process.argv.slice(2);
|
||||
const parsedArgs = parseArgs(args);
|
||||
|
||||
if (parsedArgs.showHelp) {
|
||||
showHelp();
|
||||
}
|
||||
|
||||
// Validate field if provided
|
||||
const validFields = ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
||||
if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
|
||||
console.error(`❌ Invalid field: ${parsedArgs.field}`);
|
||||
console.error(` Valid fields: ${validFields.join(', ')}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log(`\n🔧 Configuration:`);
|
||||
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
|
||||
console.log(` Batch size: ${parsedArgs.batchSize}`);
|
||||
if (parsedArgs.field) {
|
||||
console.log(` Field: ${parsedArgs.field}`);
|
||||
} else {
|
||||
console.log(` Fields: all (plaintext, md5, sha1, sha256, sha512)`);
|
||||
}
|
||||
console.log('');
|
||||
|
||||
removeDuplicates(parsedArgs).catch(console.error);
|
||||
Referencia en una nueva incidencia
Block a user