Comparar commits

..

10 Commits

Autor SHA1 Mensaje Fecha
ale
b91d19dc0b fix memory remove dup
Signed-off-by: ale <ale@manalejandro.com>
2025-12-21 22:36:31 +01:00
ale
da89037125 out useRouter
Signed-off-by: ale <ale@manalejandro.com>
2025-12-11 00:46:59 +01:00
ale
20f0503134 fix share link
Signed-off-by: ale <ale@manalejandro.com>
2025-12-08 23:11:25 +01:00
ale
42bc5a15d0 sanitize nosql
Signed-off-by: ale <ale@manalejandro.com>
2025-12-08 23:08:38 +01:00
ale
2de78b7461 share link
Signed-off-by: ale <ale@manalejandro.com>
2025-12-08 23:08:24 +01:00
ale
8fa586731a out bcrypt
Signed-off-by: ale <ale@manalejandro.com>
2025-12-08 21:06:35 +01:00
ale
ad7a1cf0a7 scroll api
Signed-off-by: ale <ale@manalejandro.com>
2025-12-08 20:58:02 +01:00
ale
459cdcd9bc remove-duplicates
Signed-off-by: ale <ale@manalejandro.com>
2025-12-08 20:56:04 +01:00
ale
9c0c30e846 show stats
Signed-off-by: ale <ale@manalejandro.com>
2025-12-07 01:30:51 +01:00
ale
179e192e82 script --no-check
Signed-off-by: ale <ale@manalejandro.com>
2025-12-07 01:28:37 +01:00
Se han modificado 14 ficheros con 810 adiciones y 146 borrados

1
API.md
Ver fichero

@@ -179,7 +179,6 @@ The API automatically detects hash types based on length and format:
| SHA1 | 40 | `^[a-f0-9]{40}$` |
| SHA256 | 64 | `^[a-f0-9]{64}$` |
| SHA512 | 128 | `^[a-f0-9]{128}$` |
| Bcrypt | 60 | `^\$2[abxy]\$` |
Hashes are case-insensitive.

Ver fichero

@@ -10,12 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
#### Core Features
- Hash search functionality for MD5, SHA1, SHA256, SHA512, and Bcrypt
- Hash search functionality for MD5, SHA1, SHA256, and SHA512
- Hash generation from plaintext input
- Automatic detection of hash types based on length and pattern
- Real-time hash generation with instant results
- Copy to clipboard functionality for all hash values
- Bcrypt verification support
#### Backend
- Elasticsearch integration with configurable endpoint

Ver fichero

@@ -13,7 +13,7 @@
## ✨ Key Features
### 🔍 Hash Search
- Search for MD5, SHA1, SHA256, SHA512, and Bcrypt hashes
- Search for MD5, SHA1, SHA256, and SHA512 hashes
- Automatic hash type detection
- Case-insensitive matching
- Real-time results
@@ -174,7 +174,6 @@ export ELASTICSEARCH_NODE=http://localhost:9200
| SHA1 | 40 | `^[a-f0-9]{40}$` |
| SHA256 | 64 | `^[a-f0-9]{64}$` |
| SHA512 | 128 | `^[a-f0-9]{128}$` |
| Bcrypt | 60 | `^\$2[abxy]\$` |
---
@@ -245,7 +244,6 @@ export ELASTICSEARCH_NODE=http://localhost:9200
## 📈 Future Enhancements
### Planned Features
- Bcrypt hash validation
- Argon2 hash support
- Search history
- Batch lookup

Ver fichero

@@ -25,7 +25,6 @@ npm run index-file -- --help # Show help
| SHA1 | 40 | `5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8` |
| SHA256 | 64 | `5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8` |
| SHA512 | 128 | `b109f3bbbc244eb82441917ed06d618b9008dd09b3befd1b5e07394c706a8bb9...` |
| Bcrypt | 60 | `$2b$10$N9qo8uLOickgx2ZMRZoMye...` |
## 🔌 API Quick Reference

Ver fichero

@@ -8,7 +8,7 @@ A modern, high-performance hash search and generation tool powered by Elasticsea
## ✨ Features
- 🔍 **Hash Lookup**: Search for MD5, SHA1, SHA256, SHA512, and Bcrypt hashes
- 🔍 **Hash Lookup**: Search for MD5, SHA1, SHA256, and SHA512 hashes
- 🔑 **Hash Generation**: Generate multiple hash types from plaintext
- 💾 **Auto-Indexing**: Automatically stores searched plaintext and hashes
- 📊 **Elasticsearch Backend**: Scalable storage with 10 shards for performance
@@ -274,7 +274,6 @@ npm run lint
| SHA1 | 40 | `^[a-f0-9]{40}$` |
| SHA256 | 64 | `^[a-f0-9]{64}$` |
| SHA512 | 128 | `^[a-f0-9]{128}$` |
| Bcrypt | 60 | `^\$2[abxy]\$` |
## 🚀 Performance

Ver fichero

@@ -8,17 +8,104 @@ interface HashDocument {
sha1: string;
sha256: string;
sha512: string;
bcrypt: string;
created_at?: string;
}
// Maximum allowed query length
const MAX_QUERY_LENGTH = 1000;

// Characters that could be used in NoSQL/Elasticsearch injection attacks
const DANGEROUS_PATTERNS = [
  /[{}\[\]]/g,          // JSON structure characters
  /\$[a-zA-Z]/g,        // MongoDB-style operators
  /\\u[0-9a-fA-F]{4}/g, // Unicode escapes
  /<script/gi,          // XSS attempts
  /javascript:/gi,      // XSS attempts
];

/**
 * Sanitize input to prevent NoSQL injection attacks.
 *
 * Takes the first whitespace-delimited token, caps its length, strips null
 * bytes, then repeatedly removes dangerous patterns until the string stops
 * changing. Repeating the removal is required: a single replace pass can be
 * bypassed with nested payloads (e.g. "<scr<scriptipt" collapses to
 * "<script" after one pass). Each pass strictly shortens the string, so the
 * loop always terminates.
 */
function sanitizeInput(input: string): string {
  // Trim and take first word only
  let sanitized = input.trim().split(/\s+/)[0] || '';

  // Limit length
  if (sanitized.length > MAX_QUERY_LENGTH) {
    sanitized = sanitized.substring(0, MAX_QUERY_LENGTH);
  }

  // Remove null bytes
  sanitized = sanitized.replace(/\0/g, '');

  // Strip dangerous patterns until a fixed point is reached so removals
  // cannot reassemble a forbidden sequence.
  let previous: string;
  do {
    previous = sanitized;
    for (const pattern of DANGEROUS_PATTERNS) {
      sanitized = sanitized.replace(pattern, '');
    }
  } while (sanitized !== previous);

  return sanitized;
}

/**
 * Validate that the input is safe for use in Elasticsearch queries.
 * Rejects empty strings, over-long input, and control characters
 * (anything other than normal whitespace such as tab/newline).
 */
function isValidInput(input: string): boolean {
  // Check for empty input
  if (!input || input.length === 0) {
    return false;
  }
  // Check for excessively long input
  if (input.length > MAX_QUERY_LENGTH) {
    return false;
  }
  // Check for control characters (except normal whitespace)
  if (/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(input)) {
    return false;
  }
  return true;
}
export async function POST(request: NextRequest) {
try {
const { query } = await request.json();
const body = await request.json();
// Validate request body structure
if (!body || typeof body !== 'object') {
return NextResponse.json(
{ error: 'Invalid request body' },
{ status: 400 }
);
}
const { query } = body;
// Validate query type
if (!query || typeof query !== 'string') {
return NextResponse.json(
{ error: 'Query parameter is required' },
{ error: 'Query parameter is required and must be a string' },
{ status: 400 }
);
}
// Validate input before processing
if (!isValidInput(query)) {
return NextResponse.json(
{ error: 'Invalid query: contains forbidden characters or is too long' },
{ status: 400 }
);
}
// Sanitize input
const cleanQuery = sanitizeInput(query);
if (!cleanQuery) {
return NextResponse.json(
{ error: 'Invalid query: only whitespace or invalid characters provided' },
{ status: 400 }
);
}
@@ -26,15 +113,6 @@ export async function POST(request: NextRequest) {
// Ensure index exists
await initializeIndex();
const cleanQuery = query.trim().split(/\s+/)[0];
if (!cleanQuery) {
return NextResponse.json(
{ error: 'Invalid query: only whitespace provided' },
{ status: 400 }
);
}
const cleanQueryLower = cleanQuery.toLowerCase();
const hashType = detectHashType(cleanQueryLower);
@@ -44,7 +122,7 @@ export async function POST(request: NextRequest) {
index: INDEX_NAME,
query: {
term: {
[hashType]: hashType === 'bcrypt' ? cleanQuery : cleanQueryLower
[hashType]: cleanQueryLower
}
}
});
@@ -66,7 +144,6 @@ export async function POST(request: NextRequest) {
sha1: source.sha1,
sha256: source.sha256,
sha512: source.sha512,
bcrypt: source.bcrypt,
}
};
})
@@ -101,11 +178,10 @@ export async function POST(request: NextRequest) {
sha1: existingDoc.sha1,
sha256: existingDoc.sha256,
sha512: existingDoc.sha512,
bcrypt: existingDoc.bcrypt,
};
} else {
// Plaintext not found, generate hashes and check if any hash already exists
hashes = await generateHashes(cleanQuery);
hashes = generateHashes(cleanQuery);
const hashExistsResponse = await esClient.search<HashDocument>({
index: INDEX_NAME,
@@ -147,7 +223,6 @@ export async function POST(request: NextRequest) {
sha1: hashes.sha1,
sha256: hashes.sha256,
sha512: hashes.sha512,
bcrypt: hashes.bcrypt,
}
});
}

Ver fichero

@@ -14,8 +14,8 @@ const geistMono = Geist_Mono({
export const metadata: Metadata = {
title: "Hasher - Hash Search & Generator",
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt. Powered by Elasticsearch.",
keywords: ["hash", "md5", "sha1", "sha256", "sha512", "bcrypt", "hash generator", "hash search", "elasticsearch"],
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512. Powered by Elasticsearch.",
keywords: ["hash", "md5", "sha1", "sha256", "sha512", "hash generator", "hash search", "elasticsearch"],
authors: [{ name: "Hasher" }],
creator: "Hasher",
publisher: "Hasher",
@@ -28,7 +28,7 @@ export const metadata: Metadata = {
openGraph: {
type: "website",
title: "Hasher - Hash Search & Generator",
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt.",
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512.",
siteName: "Hasher",
images: [
{
@@ -42,7 +42,7 @@ export const metadata: Metadata = {
twitter: {
card: "summary",
title: "Hasher - Hash Search & Generator",
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt.",
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512.",
images: ["/logo.png"],
},
viewport: {

Ver fichero

@@ -1,7 +1,8 @@
'use client';
import { useState } from 'react';
import { Search, Copy, Check, Hash, Key, AlertCircle, Loader2 } from 'lucide-react';
import { useState, useEffect, useCallback, Suspense } from 'react';
import { useSearchParams } from 'next/navigation';
import { Search, Copy, Check, Hash, Key, AlertCircle, Loader2, Database, Link } from 'lucide-react';
interface SearchResult {
found: boolean;
@@ -15,7 +16,6 @@ interface SearchResult {
sha1: string;
sha256: string;
sha512: string;
bcrypt: string;
};
results?: Array<{
plaintext: string;
@@ -24,22 +24,41 @@ interface SearchResult {
sha1: string;
sha256: string;
sha512: string;
bcrypt: string;
};
}>;
message?: string;
}
export default function Home() {
interface IndexStats {
documentCount: number;
indexSize: number;
}
/**
 * Format a byte count as a human-readable string (e.g. 1536 -> "1.5 KB").
 *
 * Non-positive or non-finite values render as "0 B" instead of producing
 * "NaN undefined"; values beyond the largest known unit are clamped to TB
 * instead of indexing off the end of the sizes array.
 */
function formatBytes(bytes: number): string {
  if (!Number.isFinite(bytes) || bytes <= 0) return '0 B';
  const k = 1024;
  const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
  // Clamp the unit index into [0, TB] so out-of-range magnitudes stay valid
  const i = Math.min(Math.max(0, Math.floor(Math.log(bytes) / Math.log(k))), sizes.length - 1);
  return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}
/**
 * Render a number with locale-aware grouping separators. Uses the default
 * locale, identical to Number.prototype.toLocaleString with no arguments.
 */
function formatNumber(num: number): string {
  return new Intl.NumberFormat().format(num);
}
function HasherContent() {
const searchParams = useSearchParams();
const [query, setQuery] = useState('');
const [result, setResult] = useState<SearchResult | null>(null);
const [loading, setLoading] = useState(false);
const [error, setError] = useState('');
const [copiedField, setCopiedField] = useState<string | null>(null);
const [stats, setStats] = useState<IndexStats | null>(null);
const [copiedLink, setCopiedLink] = useState(false);
const [initialLoadDone, setInitialLoadDone] = useState(false);
const handleSearch = async (e: React.FormEvent) => {
e.preventDefault();
if (!query.trim()) return;
const performSearch = useCallback(async (searchQuery: string, updateUrl: boolean = true) => {
if (!searchQuery.trim()) return;
setLoading(true);
setError('');
@@ -49,7 +68,7 @@ export default function Home() {
const response = await fetch('/api/search', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ query: query.trim() })
body: JSON.stringify({ query: searchQuery.trim() })
});
if (!response.ok) {
@@ -58,11 +77,53 @@ export default function Home() {
const data = await response.json();
setResult(data);
// Update URL with search query (using history API to avoid re-triggering effects)
if (updateUrl) {
const newUrl = new URL(window.location.href);
newUrl.searchParams.set('q', searchQuery.trim());
window.history.replaceState(null, '', newUrl.pathname + newUrl.search);
}
} catch (_err) {
setError('Failed to perform search. Please check your connection.');
} finally {
setLoading(false);
}
}, []);
// Load query from URL on mount (only once)
useEffect(() => {
if (initialLoadDone) return;
const urlQuery = searchParams.get('q');
if (urlQuery) {
setQuery(urlQuery);
performSearch(urlQuery, false);
}
setInitialLoadDone(true);
}, [searchParams, performSearch, initialLoadDone]);
useEffect(() => {
const fetchStats = async () => {
try {
const response = await fetch('/api/health');
if (response.ok) {
const data = await response.json();
if (data.index?.stats) {
setStats(data.index.stats);
}
}
} catch (_err) {
// Silently fail - stats are not critical
}
};
fetchStats();
}, [result]); // Refresh stats after each search result
const handleSearch = async (e: React.FormEvent) => {
e.preventDefault();
performSearch(query);
};
const copyToClipboard = (text: string, field: string) => {
@@ -71,6 +132,14 @@ export default function Home() {
setTimeout(() => setCopiedField(null), 2000);
};
const copyShareLink = () => {
const url = new URL(window.location.href);
url.searchParams.set('q', query.trim());
navigator.clipboard.writeText(url.toString());
setCopiedLink(true);
setTimeout(() => setCopiedLink(false), 2000);
};
const HashDisplay = ({ label, value, field }: { label: string; value: string; field: string }) => (
<div className="bg-gray-50 rounded-lg p-4 border border-gray-200">
<div className="flex items-center justify-between mb-2">
@@ -108,8 +177,20 @@ export default function Home() {
Search for hashes or generate them from plaintext
</p>
<p className="text-sm text-gray-500 mt-2">
Supports MD5, SHA1, SHA256, SHA512, and Bcrypt
Supports MD5, SHA1, SHA256, and SHA512
</p>
{stats && (
<div className="flex items-center justify-center gap-4 mt-4 text-sm text-gray-500">
<div className="flex items-center gap-1.5">
<Database className="w-4 h-4" />
<span><strong>{formatNumber(stats.documentCount)}</strong> hashes</span>
</div>
<span className="text-gray-300"></span>
<div>
<span><strong>{formatBytes(stats.indexSize)}</strong> indexed</span>
</div>
</div>
)}
</div>
{/* Search Form */}
@@ -120,19 +201,35 @@ export default function Home() {
value={query}
onChange={(e) => setQuery(e.target.value)}
placeholder="Enter a hash or plaintext..."
className="w-full px-6 py-4 pr-14 text-lg rounded-2xl border-2 border-gray-200 focus:border-blue-500 focus:ring-4 focus:ring-blue-100 outline-none transition-all shadow-sm"
className="w-full px-6 py-4 pr-28 text-lg rounded-2xl border-2 border-gray-200 focus:border-blue-500 focus:ring-4 focus:ring-blue-100 outline-none transition-all shadow-sm"
/>
<button
type="submit"
disabled={loading || !query.trim()}
className="absolute right-2 top-1/2 -translate-y-1/2 bg-gradient-to-r from-blue-600 to-purple-600 text-white p-3 rounded-xl hover:shadow-lg disabled:opacity-50 disabled:cursor-not-allowed transition-all"
>
{loading ? (
<Loader2 className="w-6 h-6 animate-spin" />
) : (
<Search className="w-6 h-6" />
<div className="absolute right-2 top-1/2 -translate-y-1/2 flex gap-1">
{query.trim() && (
<button
type="button"
onClick={copyShareLink}
className="bg-gray-100 text-gray-600 p-3 rounded-xl hover:bg-gray-200 transition-all"
title="Copy share link"
>
{copiedLink ? (
<Check className="w-6 h-6 text-green-600" />
) : (
<Link className="w-6 h-6" />
)}
</button>
)}
</button>
<button
type="submit"
disabled={loading || !query.trim()}
className="bg-gradient-to-r from-blue-600 to-purple-600 text-white p-3 rounded-xl hover:shadow-lg disabled:opacity-50 disabled:cursor-not-allowed transition-all"
>
{loading ? (
<Loader2 className="w-6 h-6 animate-spin" />
) : (
<Search className="w-6 h-6" />
)}
</button>
</div>
</div>
</form>
@@ -166,7 +263,6 @@ export default function Home() {
<HashDisplay label="SHA1" value={result.hashes!.sha1} field="sha1-gen" />
<HashDisplay label="SHA256" value={result.hashes!.sha256} field="sha256-gen" />
<HashDisplay label="SHA512" value={result.hashes!.sha512} field="sha512-gen" />
<HashDisplay label="Bcrypt" value={result.hashes!.bcrypt} field="bcrypt-gen" />
</div>
{result.wasGenerated && (
<div className="mt-6 bg-blue-50 border border-blue-200 rounded-xl p-4">
@@ -212,7 +308,6 @@ export default function Home() {
<HashDisplay label="SHA1" value={item.hashes.sha1} field={`sha1-${idx}`} />
<HashDisplay label="SHA256" value={item.hashes.sha256} field={`sha256-${idx}`} />
<HashDisplay label="SHA512" value={item.hashes.sha512} field={`sha512-${idx}`} />
<HashDisplay label="Bcrypt" value={item.hashes.bcrypt} field={`bcrypt-${idx}`} />
</div>
</div>
))}
@@ -256,7 +351,7 @@ export default function Home() {
</div>
<h3 className="text-xl font-bold text-gray-900 mb-2">Generate Hashes</h3>
<p className="text-gray-600">
Enter any plaintext to instantly generate MD5, SHA1, SHA256, SHA512, and Bcrypt hashes. Results are saved automatically.
Enter any plaintext to instantly generate MD5, SHA1, SHA256, and SHA512 hashes. Results are saved automatically.
</p>
</div>
</div>
@@ -271,3 +366,19 @@ export default function Home() {
);
}
function LoadingFallback() {
return (
<div className="min-h-screen bg-gradient-to-br from-blue-50 via-white to-purple-50 flex items-center justify-center">
<Loader2 className="w-12 h-12 text-blue-600 animate-spin" />
</div>
);
}
export default function Home() {
return (
<Suspense fallback={<LoadingFallback />}>
<HasherContent />
</Suspense>
);
}

Ver fichero

@@ -46,9 +46,6 @@ export const INDEX_MAPPING = {
sha512: {
type: 'keyword' as const
},
bcrypt: {
type: 'keyword' as const
},
created_at: {
type: 'date' as const
}

Ver fichero

@@ -1,5 +1,4 @@
import crypto from 'crypto';
import bcrypt from 'bcrypt';
export interface HashResult {
plaintext: string;
@@ -7,22 +6,18 @@ export interface HashResult {
sha1: string;
sha256: string;
sha512: string;
bcrypt: string;
}
/**
* Generate all common hashes for a given plaintext
*/
export async function generateHashes(plaintext: string): Promise<HashResult> {
const bcryptHash = await bcrypt.hash(plaintext, 10);
export function generateHashes(plaintext: string): HashResult {
return {
plaintext,
md5: crypto.createHash('md5').update(plaintext).digest('hex'),
sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
bcrypt: bcryptHash,
};
}
@@ -52,11 +47,6 @@ export function detectHashType(hash: string): string | null {
return 'sha512';
}
// BCrypt: starts with $2a$, $2b$, $2x$, or $2y$
if (/^\$2[abxy]\$/.test(cleanHash)) {
return 'bcrypt';
}
return null;
}
@@ -66,14 +56,3 @@ export function detectHashType(hash: string): string | null {
export function isHash(input: string): boolean {
return detectHashType(input) !== null;
}
/**
* Verify a plaintext against a bcrypt hash
*/
export async function verifyBcrypt(plaintext: string, hash: string): Promise<boolean> {
try {
return await bcrypt.compare(plaintext, hash);
} catch (_error) {
return false;
}
}

Ver fichero

@@ -34,12 +34,11 @@
"build": "next build",
"start": "next start",
"lint": "eslint",
"index-file": "tsx scripts/index-file.ts"
"index-file": "tsx scripts/index-file.ts",
"remove-duplicates": "tsx scripts/remove-duplicates.ts"
},
"dependencies": {
"@elastic/elasticsearch": "^9.2.0",
"@types/bcrypt": "^6.0.0",
"bcrypt": "^6.0.0",
"lucide-react": "^0.555.0",
"next": "15.4.8",
"react": "19.1.2",

Ver fichero

@@ -1,7 +1,7 @@
{
"name": "Hasher - Hash Search & Generator",
"short_name": "Hasher",
"description": "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt.",
"description": "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512.",
"start_url": "/",
"display": "standalone",
"background_color": "#ffffff",

Ver fichero

@@ -14,6 +14,7 @@
* --batch-size=<number> Number of items to process in each batch (default: 100)
* --resume Resume from last saved state (default: true)
* --no-resume Start from beginning, ignore saved state
* --no-check Skip duplicate checking (faster, but may create duplicates)
* --state-file=<path> Custom state file path (default: .indexer-state-<filename>.json)
* --help, -h Show this help message
*/
@@ -34,7 +35,6 @@ interface HashDocument {
sha1: string;
sha256: string;
sha512: string;
bcrypt: string;
created_at: string;
}
@@ -54,6 +54,7 @@ interface ParsedArgs {
filePath: string | null;
batchSize: number;
resume: boolean;
checkDuplicates: boolean;
stateFile: string | null;
showHelp: boolean;
}
@@ -63,6 +64,7 @@ function parseArgs(args: string[]): ParsedArgs {
filePath: null,
batchSize: DEFAULT_BATCH_SIZE,
resume: true,
checkDuplicates: true,
stateFile: null,
showHelp: false
};
@@ -76,6 +78,8 @@ function parseArgs(args: string[]): ParsedArgs {
result.resume = true;
} else if (arg === '--no-resume') {
result.resume = false;
} else if (arg === '--no-check') {
result.checkDuplicates = false;
} else if (arg.startsWith('--batch-size=')) {
const value = arg.split('=')[1];
const parsed = parseInt(value, 10);
@@ -152,17 +156,13 @@ function deleteState(stateFile: string): void {
}
}
async function generateHashes(plaintext: string): Promise<HashDocument> {
const bcrypt = await import('bcrypt');
const bcryptHash = await bcrypt.default.hash(plaintext, 10);
function generateHashes(plaintext: string): HashDocument {
return {
plaintext,
md5: crypto.createHash('md5').update(plaintext).digest('hex'),
sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
bcrypt: bcryptHash,
created_at: new Date().toISOString()
};
}
@@ -180,6 +180,7 @@ Options:
--batch-size <number> Alternative syntax for batch size
--resume Resume from last saved state (default)
--no-resume Start from beginning, ignore saved state
--no-check Skip duplicate checking (faster, but may create duplicates)
--state-file=<path> Custom state file path
--help, -h Show this help message
@@ -191,17 +192,23 @@ Examples:
npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
npx tsx scripts/index-file.ts wordlist.txt --no-resume
npm run index-file -- wordlist.txt --batch-size=500
npx tsx scripts/index-file.ts wordlist.txt --no-check
npm run index-file -- wordlist.txt --batch-size=500 --no-check
State Management:
The script automatically saves progress to a state file. If interrupted,
it will resume from where it left off on the next run. Use --no-resume
to start fresh.
Duplicate Checking:
By default, the script checks if each plaintext or hash already exists
in the index before inserting. Use --no-check to skip this verification
for faster indexing (useful when you're sure there are no duplicates).
`);
process.exit(0);
}
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, customStateFile: string | null) {
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
const client = new Client({ node: ELASTICSEARCH_NODE });
const absolutePath = resolve(filePath);
const stateFile = customStateFile || getDefaultStateFile(absolutePath);
@@ -244,6 +251,7 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
console.log(`Index: ${INDEX_NAME}`);
console.log(`File: ${filePath}`);
console.log(`Batch size: ${batchSize}`);
console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
console.log(`State file: ${stateFile}`);
if (resumingFrom > 0) {
console.log(`Resuming from: line ${resumingFrom}`);
@@ -300,65 +308,69 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
const bulkOperations: any[] = [];
// Generate hashes for all items in batch first
const batchWithHashes = await Promise.all(
batch.map(async (plaintext: string) => ({
plaintext,
hashes: await generateHashes(plaintext)
}))
);
const batchWithHashes = batch.map((plaintext: string) => ({
plaintext,
hashes: generateHashes(plaintext)
}));
// Check which items already exist (by plaintext or any hash)
const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256);
const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512);
if (checkDuplicates) {
// Check which items already exist (by plaintext or any hash)
const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256);
const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512);
const existingCheck = await client.search({
index: INDEX_NAME,
size: batchSize * 5,
query: {
bool: {
should: [
{ terms: { 'plaintext.keyword': batch } },
{ terms: { md5: md5List } },
{ terms: { sha1: sha1List } },
{ terms: { sha256: sha256List } },
{ terms: { sha512: sha512List } },
],
minimum_should_match: 1
const existingCheck = await client.search({
index: INDEX_NAME,
size: batchSize * 5,
query: {
bool: {
should: [
{ terms: { 'plaintext.keyword': batch } },
{ terms: { md5: md5List } },
{ terms: { sha1: sha1List } },
{ terms: { sha256: sha256List } },
{ terms: { sha512: sha512List } },
],
minimum_should_match: 1
}
},
_source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
});
// Create a set of existing hashes for quick lookup
const existingHashes = new Set<string>();
existingCheck.hits.hits.forEach((hit: any) => {
const src = hit._source;
existingHashes.add(src.plaintext);
existingHashes.add(src.md5);
existingHashes.add(src.sha1);
existingHashes.add(src.sha256);
existingHashes.add(src.sha512);
});
// Prepare bulk operations only for items that don't have any duplicate hash
for (const item of batchWithHashes) {
const isDuplicate =
existingHashes.has(item.plaintext) ||
existingHashes.has(item.hashes.md5) ||
existingHashes.has(item.hashes.sha1) ||
existingHashes.has(item.hashes.sha256) ||
existingHashes.has(item.hashes.sha512);
if (!isDuplicate) {
bulkOperations.push({ index: { _index: INDEX_NAME } });
bulkOperations.push(item.hashes);
} else {
state.skipped++;
sessionSkipped++;
}
},
_source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
});
// Create a set of existing hashes for quick lookup
const existingHashes = new Set<string>();
existingCheck.hits.hits.forEach((hit: any) => {
const src = hit._source;
existingHashes.add(src.plaintext);
existingHashes.add(src.md5);
existingHashes.add(src.sha1);
existingHashes.add(src.sha256);
existingHashes.add(src.sha512);
});
// Prepare bulk operations only for items that don't have any duplicate hash
let batchSkipped = 0;
for (const item of batchWithHashes) {
const isDuplicate =
existingHashes.has(item.plaintext) ||
existingHashes.has(item.hashes.md5) ||
existingHashes.has(item.hashes.sha1) ||
existingHashes.has(item.hashes.sha256) ||
existingHashes.has(item.hashes.sha512);
if (!isDuplicate) {
}
} else {
// No duplicate checking - index everything
for (const item of batchWithHashes) {
bulkOperations.push({ index: { _index: INDEX_NAME } });
bulkOperations.push(item.hashes);
} else {
batchSkipped++;
state.skipped++;
sessionSkipped++;
}
}
@@ -498,9 +510,10 @@ console.log(`\n🔧 Configuration:`);
console.log(` File: ${filePath}`);
console.log(` Batch size: ${parsedArgs.batchSize}`);
console.log(` Resume: ${parsedArgs.resume}`);
console.log(` Check duplicates: ${parsedArgs.checkDuplicates}`);
if (parsedArgs.stateFile) {
console.log(` State file: ${parsedArgs.stateFile}`);
}
console.log('');
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.stateFile).catch(console.error);
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);

496
scripts/remove-duplicates.ts Archivo normal
Ver fichero

@@ -0,0 +1,496 @@
#!/usr/bin/env node
/**
* Hasher Duplicate Remover Script
*
* This script finds and removes duplicate entries from the Elasticsearch index.
* It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
*
* Usage:
* npx tsx scripts/remove-duplicates.ts [options]
* npm run remove-duplicates [-- options]
*
* Options:
* --dry-run Show duplicates without removing them (default)
* --execute Actually remove the duplicates
* --batch-size=<number> Number of items to process in each batch (default: 1000)
* --field=<field> Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
* --help, -h Show this help message
*/
import { Client } from '@elastic/elasticsearch';
// Elasticsearch connection target; overridable via environment variable
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const INDEX_NAME = 'hasher';
const DEFAULT_BATCH_SIZE = 1000;

/** Parsed command-line options for the duplicate remover. */
interface ParsedArgs {
  dryRun: boolean;      // true = report only (default); false = actually delete
  batchSize: number;
  field: string | null; // restrict duplicate detection to a single field
  showHelp: boolean;
}

/** One group of documents sharing the same value in a given field. */
interface DuplicateGroup {
  value: string;
  field: string;
  documentIds: string[];
  keepId: string;       // oldest document — retained
  deleteIds: string[];  // newer duplicates — removed
}

/**
 * Parse CLI arguments into a ParsedArgs structure.
 *
 * Unknown flags are silently ignored and invalid batch sizes keep the
 * default. Both "--opt=value" and "--opt value" forms are accepted.
 */
function parseArgs(args: string[]): ParsedArgs {
  const options: ParsedArgs = {
    dryRun: true,
    batchSize: DEFAULT_BATCH_SIZE,
    field: null,
    showHelp: false
  };

  // Only strictly positive integers count as a valid batch size.
  const asPositiveInt = (raw: string): number | null => {
    const n = parseInt(raw, 10);
    return !isNaN(n) && n > 0 ? n : null;
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    const next = args[i + 1];

    if (arg === '--help' || arg === '-h') {
      options.showHelp = true;
    } else if (arg === '--dry-run') {
      options.dryRun = true;
    } else if (arg === '--execute') {
      options.dryRun = false;
    } else if (arg.startsWith('--batch-size=')) {
      const n = asPositiveInt(arg.split('=')[1]);
      if (n !== null) {
        options.batchSize = n;
      }
    } else if (arg === '--batch-size') {
      // Two-token form: the next token is consumed only when it parses
      // as a valid size (mirrors the original lookahead behavior).
      if (next && !next.startsWith('-')) {
        const n = asPositiveInt(next);
        if (n !== null) {
          options.batchSize = n;
          i++;
        }
      }
    } else if (arg.startsWith('--field=')) {
      options.field = arg.split('=')[1];
    } else if (arg === '--field') {
      if (next && !next.startsWith('-')) {
        options.field = next;
        i++;
      }
    }
  }

  return options;
}
/**
 * Print CLI usage information for the duplicate remover and terminate the
 * process with exit code 0. Never returns.
 */
function showHelp() {
  console.log(`
Hasher Duplicate Remover Script
Usage:
npx tsx scripts/remove-duplicates.ts [options]
npm run remove-duplicates [-- options]
Options:
--dry-run Show duplicates without removing them (default)
--execute Actually remove the duplicates
--batch-size=<number> Number of items to process in each batch (default: 1000)
--field=<field> Check duplicates only on this field
Valid fields: plaintext, md5, sha1, sha256, sha512
--help, -h Show this help message
Environment Variables:
ELASTICSEARCH_NODE Elasticsearch node URL (default: http://localhost:9200)
Examples:
npx tsx scripts/remove-duplicates.ts # Dry run, show all duplicates
npx tsx scripts/remove-duplicates.ts --execute # Remove all duplicates
npx tsx scripts/remove-duplicates.ts --field=md5 # Check only md5 duplicates
npx tsx scripts/remove-duplicates.ts --execute --field=plaintext
Notes:
- The script keeps the OLDEST document (by created_at) and removes newer duplicates
- Always run with --dry-run first to review what will be deleted
- Duplicates are checked across all hash fields by default
`);
  // Exit 0: showing help is an informational action, not an error.
  process.exit(0);
}
/**
 * Find groups of documents that share the same value in `field`.
 *
 * Strategy: page through a composite aggregation over the field (this scales
 * to an unbounded number of distinct values), keep only buckets containing
 * more than one document, then collect every matching document id via the
 * scroll API sorted by created_at ascending so the oldest document can be
 * kept and the rest marked for deletion.
 *
 * NOTE(review): the sort assumes created_at is present on every document —
 * the index mapping marks it optional, so documents missing it may order
 * unexpectedly; verify before relying on "keep the oldest".
 */
async function findDuplicatesForField(
  client: Client,
  field: string,
  batchSize: number
): Promise<DuplicateGroup[]> {
  const duplicates: DuplicateGroup[] = [];
  // Use aggregation to find duplicate values; plaintext is aggregated on its
  // .keyword sub-field (exact match), hash fields are already keyword-typed
  const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
  // Use composite aggregation to handle large number of duplicates
  let afterKey: any = undefined;
  let hasMore = true;
  console.log(` Scanning for duplicates...`);
  while (hasMore) {
    const aggQuery: any = {
      index: INDEX_NAME,
      size: 0, // aggregation only — no document hits needed
      aggs: {
        duplicates: {
          composite: {
            size: batchSize,
            sources: [
              { value: { terms: { field: fieldToAggregate } } }
            ],
            // resume pagination from the previous page's after_key
            ...(afterKey && { after: afterKey })
          },
          aggs: {
            // keep only buckets that actually contain duplicates
            doc_count_filter: {
              bucket_selector: {
                buckets_path: { count: '_count' },
                script: 'params.count > 1'
              }
            }
          }
        }
      }
    };
    const response = await client.search(aggQuery);
    const compositeAgg = response.aggregations?.duplicates as any;
    const buckets = compositeAgg?.buckets || [];
    for (const bucket of buckets) {
      if (bucket.doc_count > 1) {
        const value = bucket.key.value;
        // Use scroll API for large result sets
        const documentIds: string[] = [];
        let scrollResponse = await client.search({
          index: INDEX_NAME,
          scroll: '1m',
          size: 1000,
          query: {
            term: {
              [fieldToAggregate]: value
            }
          },
          // oldest first, so documentIds[0] becomes the document to keep
          sort: [
            { created_at: { order: 'asc' } }
          ],
          _source: false // ids only — document sources are not needed
        });
        while (scrollResponse.hits.hits.length > 0) {
          documentIds.push(...scrollResponse.hits.hits.map((hit: any) => hit._id));
          if (!scrollResponse._scroll_id) break;
          scrollResponse = await client.scroll({
            scroll_id: scrollResponse._scroll_id,
            scroll: '1m'
          });
        }
        // Clear scroll to free server-side resources; failure is non-fatal
        if (scrollResponse._scroll_id) {
          await client.clearScroll({ scroll_id: scrollResponse._scroll_id }).catch(() => {});
        }
        if (documentIds.length > 1) {
          duplicates.push({
            value: String(value),
            field,
            documentIds,
            keepId: documentIds[0], // Keep the oldest
            deleteIds: documentIds.slice(1) // Delete the rest
          });
        }
      }
    }
    // Check if there are more results: a full page plus an after_key means
    // the composite aggregation has more pages to fetch
    afterKey = compositeAgg?.after_key;
    hasMore = buckets.length === batchSize && afterKey;
    if (hasMore) {
      process.stdout.write(`\r Found ${duplicates.length} duplicate groups so far...`);
    }
  }
  return duplicates;
}
/**
* Phase 1: Initialize and connect to Elasticsearch
*/
async function phase1_InitAndConnect() {
console.log(`🔍 Hasher Duplicate Remover - Phase 1: Initialization`);
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
console.log(`Index: ${INDEX_NAME}`);
console.log('');
const client = new Client({ node: ELASTICSEARCH_NODE });
console.log('🔗 Connecting to Elasticsearch...');
await client.cluster.health({});
console.log('✅ Connected successfully\n');
const countResponse = await client.count({ index: INDEX_NAME });
console.log(`📊 Total documents in index: ${countResponse.count}\n`);
return { client, totalDocuments: countResponse.count };
}
/**
 * Phase 2: locate duplicate groups for one field and drop documents that an
 * earlier field's pass has already scheduled for deletion.
 *
 * @param client        Connected Elasticsearch client.
 * @param field         The document field being checked.
 * @param batchSize     Composite-aggregation page size.
 * @param seenDeleteIds Mutable set of ids already queued for deletion;
 *                      this call adds any newly queued ids to it.
 * @returns The filtered groups plus the raw number of groups found.
 */
async function phase2_FindDuplicatesForField(
  client: Client,
  field: string,
  batchSize: number,
  seenDeleteIds: Set<string>
): Promise<{ duplicates: DuplicateGroup[], totalFound: number }> {
  console.log(`\n🔍 Phase 2: Checking duplicates for field: ${field}...`);
  const rawGroups = await findDuplicatesForField(client, field, batchSize);

  // Keep only groups that still contribute ids not claimed by a prior field,
  // so the same document is never counted (or deleted) twice.
  const freshGroups: DuplicateGroup[] = [];
  for (const group of rawGroups) {
    const unseenIds = group.deleteIds.filter(id => !seenDeleteIds.has(id));
    if (unseenIds.length === 0) continue;
    group.deleteIds = unseenIds;
    for (const id of unseenIds) {
      seenDeleteIds.add(id);
    }
    freshGroups.push(group);
  }

  const newDeleteCount = freshGroups.reduce((sum, group) => sum + group.deleteIds.length, 0);
  console.log(` Found ${rawGroups.length} duplicate groups for ${field}`);
  console.log(` New unique documents to delete: ${newDeleteCount}`);

  // Hint GC between fields (only effective when run with --expose-gc).
  if (global.gc) {
    global.gc();
    console.log(` ♻️ Memory freed after processing ${field}`);
  }
  return { duplicates: freshGroups, totalFound: rawGroups.length };
}
/**
 * Phase 3: delete one batch of duplicate documents via the bulk API.
 *
 * @param client     Connected Elasticsearch client.
 * @param deleteIds  Full list of document ids scheduled for deletion.
 * @param batchSize  Maximum number of delete actions per bulk request.
 * @param startIndex Offset into `deleteIds` where this batch begins.
 * @returns Counts of successfully deleted documents and failures in this batch.
 */
async function phase3_DeleteBatch(
  client: Client,
  deleteIds: string[],
  batchSize: number,
  startIndex: number
): Promise<{ deleted: number, errors: number }> {
  const batch = deleteIds.slice(startIndex, startIndex + batchSize);
  let deleted = 0;
  let errors = 0;
  try {
    // One delete action per id — plain map() (the original flatMap over
    // single-element arrays was an unnecessary indirection).
    const bulkOperations = batch.map(id => (
      { delete: { _index: INDEX_NAME, _id: id } }
    ));
    const bulkResponse = await client.bulk({
      operations: bulkOperations,
      refresh: false // defer the index refresh to phase 4 for throughput
    });
    if (bulkResponse.errors) {
      // Partial failure: count per-item errors, credit the rest as deleted.
      const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
      errors += errorCount;
      deleted += batch.length - errorCount;
    } else {
      deleted += batch.length;
    }
  } catch (error) {
    // Whole-request failure: the entire batch counts as failed.
    console.error(`\n❌ Error deleting batch:`, error);
    errors += batch.length;
  }
  // Hint GC between batches (only effective when run with --expose-gc).
  if (global.gc) {
    global.gc();
  }
  return { deleted, errors };
}
/**
 * Phase 4: refresh the index and print the final deletion report.
 *
 * @param client               Connected Elasticsearch client.
 * @param totalDeleted         Documents removed across all batches.
 * @param totalErrors          Failed deletions across all batches.
 * @param initialDocumentCount Document count captured before deletion began.
 */
async function phase4_Finalize(
  client: Client,
  totalDeleted: number,
  totalErrors: number,
  initialDocumentCount: number
) {
  console.log('\n\n🔄 Phase 4: Refreshing index...');
  // Make the deletions visible to searches before recounting.
  await client.indices.refresh({ index: INDEX_NAME });
  const { count: remaining } = await client.count({ index: INDEX_NAME });

  const divider = '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━';
  console.log('\n' + divider);
  console.log('✅ Duplicate removal complete!');
  console.log(divider);
  console.log(`Documents deleted: ${totalDeleted}`);
  console.log(`Errors: ${totalErrors}`);
  console.log(`Previous document count: ${initialDocumentCount}`);
  console.log(`New document count: ${remaining}`);
  console.log('');
}
/**
 * Orchestrate the four phases of duplicate removal: connect, find duplicate
 * groups per field, (optionally) delete them in batches, then report.
 *
 * In dry-run mode (the default) it only prints what would be deleted.
 * Exits the process with code 1 on any unexpected error.
 *
 * @param parsedArgs Parsed CLI options (dryRun, batchSize, optional field).
 */
async function removeDuplicates(parsedArgs: ParsedArgs) {
  // A single --field restricts the scan; otherwise every indexed field is checked.
  const fields = parsedArgs.field
    ? [parsedArgs.field]
    : ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
  console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
  console.log(`Batch size: ${parsedArgs.batchSize}`);
  console.log(`Fields to check: ${fields.join(', ')}`);
  console.log('');
  try {
    // === PHASE 1: Initialize ===
    const { client, totalDocuments } = await phase1_InitAndConnect();
    // Force garbage collection after phase 1 (only available with --expose-gc)
    if (global.gc) {
      global.gc();
      console.log('♻️ Memory freed after initialization\n');
    }
    // === PHASE 2: Find duplicates field by field ===
    // seenDeleteIds ensures a document queued under one field is not
    // re-counted (or deleted twice) when a later field finds it again.
    const allDuplicates: DuplicateGroup[] = [];
    const seenDeleteIds = new Set<string>();
    for (const field of fields) {
      const { duplicates } = await phase2_FindDuplicatesForField(
        client,
        field,
        parsedArgs.batchSize,
        seenDeleteIds
      );
      allDuplicates.push(...duplicates);
      // Clear field duplicates to free memory (allDuplicates keeps the refs)
      duplicates.length = 0;
    }
    const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
    console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
    console.log(`📋 Summary:`);
    console.log(` Duplicate groups found: ${allDuplicates.length}`);
    console.log(` Documents to delete: ${totalToDelete}`);
    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
    if (allDuplicates.length === 0) {
      console.log('✨ No duplicates found! Index is clean.\n');
      return;
    }
    // Show sample of duplicates
    console.log(`📝 Sample duplicates (showing first 10):\n`);
    const samplesToShow = allDuplicates.slice(0, 10);
    for (const dup of samplesToShow) {
      // Truncate long values (e.g. plaintext) so the report stays readable
      const truncatedValue = dup.value.length > 50
        ? dup.value.substring(0, 50) + '...'
        : dup.value;
      console.log(` Field: ${dup.field}`);
      console.log(` Value: ${truncatedValue}`);
      console.log(` Keep: ${dup.keepId}`);
      console.log(` Delete: ${dup.deleteIds.length} document(s)`);
      console.log('');
    }
    if (allDuplicates.length > 10) {
      console.log(` ... and ${allDuplicates.length - 10} more duplicate groups\n`);
    }
    // Dry-run stops here: nothing is deleted, only reported.
    if (parsedArgs.dryRun) {
      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
      console.log(`🔎 DRY RUN - No changes made`);
      console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
      console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
      return;
    }
    // === PHASE 3: Execute deletion in batches ===
    console.log(`\n🗑 Phase 3: Removing ${totalToDelete} duplicate documents...\n`);
    let totalDeleted = 0;
    let totalErrors = 0;
    // Flatten every group's delete list into one work queue
    const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
    // Clear allDuplicates to free memory
    allDuplicates.length = 0;
    // Delete in batches with memory management
    for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
      const { deleted, errors } = await phase3_DeleteBatch(
        client,
        deleteIds,
        parsedArgs.batchSize,
        i
      );
      totalDeleted += deleted;
      totalErrors += errors;
      // Single-line progress indicator (\r overwrites the previous write)
      process.stdout.write(
        `\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - ` +
        `Deleted: ${totalDeleted}, Errors: ${totalErrors}`
      );
    }
    // Clear deleteIds to free memory
    deleteIds.length = 0;
    seenDeleteIds.clear();
    // === PHASE 4: Finalize ===
    await phase4_Finalize(client, totalDeleted, totalErrors, totalDocuments);
  } catch (error) {
    console.error('\n❌ Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  }
}
// ─── Script entry point ───
// Parse command line arguments
const args = process.argv.slice(2);
const parsedArgs = parseArgs(args);
if (parsedArgs.showHelp) {
  showHelp(); // prints usage and exits the process
}
// Validate field if provided
const validFields = ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
  console.error(`❌ Invalid field: ${parsedArgs.field}`);
  console.error(` Valid fields: ${validFields.join(', ')}`);
  process.exit(1);
}
console.log(`\n🔧 Configuration:`);
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
console.log(` Batch size: ${parsedArgs.batchSize}`);
if (parsedArgs.field) {
  console.log(` Field: ${parsedArgs.field}`);
} else {
  console.log(` Fields: all (plaintext, md5, sha1, sha256, sha512)`);
}
console.log('');
// BUG FIX: the previous `.catch(console.error)` logged an unexpected
// rejection but left the exit code at 0, so callers/CI could not detect
// the failure. Log the error AND exit non-zero.
removeDuplicates(parsedArgs).catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});