Comparar commits
10 Commits
bb234fef1e
...
elasticsea
| Autor | SHA1 | Fecha | |
|---|---|---|---|
|
b91d19dc0b
|
|||
|
da89037125
|
|||
|
20f0503134
|
|||
|
42bc5a15d0
|
|||
|
2de78b7461
|
|||
|
8fa586731a
|
|||
|
ad7a1cf0a7
|
|||
|
459cdcd9bc
|
|||
|
9c0c30e846
|
|||
|
179e192e82
|
1
API.md
1
API.md
@@ -179,7 +179,6 @@ The API automatically detects hash types based on length and format:
|
||||
| SHA1 | 40 | `^[a-f0-9]{40}$` |
|
||||
| SHA256 | 64 | `^[a-f0-9]{64}$` |
|
||||
| SHA512 | 128 | `^[a-f0-9]{128}$` |
|
||||
| Bcrypt | 60 | `^\$2[abxy]\$` |
|
||||
|
||||
Hashes are case-insensitive.
|
||||
|
||||
|
||||
@@ -10,12 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
### Added
|
||||
|
||||
#### Core Features
|
||||
- Hash search functionality for MD5, SHA1, SHA256, SHA512, and Bcrypt
|
||||
- Hash search functionality for MD5, SHA1, SHA256, and SHA512
|
||||
- Hash generation from plaintext input
|
||||
- Automatic detection of hash types based on length and pattern
|
||||
- Real-time hash generation with instant results
|
||||
- Copy to clipboard functionality for all hash values
|
||||
- Bcrypt verification support
|
||||
|
||||
#### Backend
|
||||
- Elasticsearch integration with configurable endpoint
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
## ✨ Key Features
|
||||
|
||||
### 🔍 Hash Search
|
||||
- Search for MD5, SHA1, SHA256, SHA512, and Bcrypt hashes
|
||||
- Search for MD5, SHA1, SHA256, and SHA512 hashes
|
||||
- Automatic hash type detection
|
||||
- Case-insensitive matching
|
||||
- Real-time results
|
||||
@@ -174,7 +174,6 @@ export ELASTICSEARCH_NODE=http://localhost:9200
|
||||
| SHA1 | 40 | `^[a-f0-9]{40}$` |
|
||||
| SHA256 | 64 | `^[a-f0-9]{64}$` |
|
||||
| SHA512 | 128 | `^[a-f0-9]{128}$` |
|
||||
| Bcrypt | 60 | `^\$2[abxy]\$` |
|
||||
|
||||
---
|
||||
|
||||
@@ -245,7 +244,6 @@ export ELASTICSEARCH_NODE=http://localhost:9200
|
||||
## 📈 Future Enhancements
|
||||
|
||||
### Planned Features
|
||||
- Bcrypt hash validation
|
||||
- Argon2 hash support
|
||||
- Search history
|
||||
- Batch lookup
|
||||
|
||||
@@ -25,7 +25,6 @@ npm run index-file -- --help # Show help
|
||||
| SHA1 | 40 | `5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8` |
|
||||
| SHA256 | 64 | `5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8` |
|
||||
| SHA512 | 128 | `b109f3bbbc244eb82441917ed06d618b9008dd09b3befd1b5e07394c706a8bb9...` |
|
||||
| Bcrypt | 60 | `$2b$10$N9qo8uLOickgx2ZMRZoMye...` |
|
||||
|
||||
## 🔌 API Quick Reference
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ A modern, high-performance hash search and generation tool powered by Elasticsea
|
||||
|
||||
## ✨ Features
|
||||
|
||||
- 🔍 **Hash Lookup**: Search for MD5, SHA1, SHA256, SHA512, and Bcrypt hashes
|
||||
- 🔍 **Hash Lookup**: Search for MD5, SHA1, SHA256, and SHA512 hashes
|
||||
- 🔑 **Hash Generation**: Generate multiple hash types from plaintext
|
||||
- 💾 **Auto-Indexing**: Automatically stores searched plaintext and hashes
|
||||
- 📊 **Elasticsearch Backend**: Scalable storage with 10 shards for performance
|
||||
@@ -274,7 +274,6 @@ npm run lint
|
||||
| SHA1 | 40 | `^[a-f0-9]{40}$` |
|
||||
| SHA256 | 64 | `^[a-f0-9]{64}$` |
|
||||
| SHA512 | 128 | `^[a-f0-9]{128}$` |
|
||||
| Bcrypt | 60 | `^\$2[abxy]\$` |
|
||||
|
||||
## 🚀 Performance
|
||||
|
||||
|
||||
@@ -8,17 +8,104 @@ interface HashDocument {
|
||||
sha1: string;
|
||||
sha256: string;
|
||||
sha512: string;
|
||||
bcrypt: string;
|
||||
created_at?: string;
|
||||
}
|
||||
|
||||
// Maximum allowed query length
|
||||
const MAX_QUERY_LENGTH = 1000;
|
||||
|
||||
// Characters that could be used in NoSQL/Elasticsearch injection attacks
|
||||
const DANGEROUS_PATTERNS = [
|
||||
/[{}\[\]]/g, // JSON structure characters
|
||||
/\$[a-zA-Z]/g, // MongoDB-style operators
|
||||
/\\u[0-9a-fA-F]{4}/g, // Unicode escapes
|
||||
/<script/gi, // XSS attempts
|
||||
/javascript:/gi, // XSS attempts
|
||||
];
|
||||
|
||||
/**
|
||||
* Sanitize input to prevent NoSQL injection attacks
|
||||
* For hash lookups, we only need alphanumeric characters and $
|
||||
* For plaintext, we allow more characters but sanitize dangerous patterns
|
||||
*/
|
||||
function sanitizeInput(input: string): string {
|
||||
// Trim and take first word only
|
||||
let sanitized = input.trim().split(/\s+/)[0] || '';
|
||||
|
||||
// Limit length
|
||||
if (sanitized.length > MAX_QUERY_LENGTH) {
|
||||
sanitized = sanitized.substring(0, MAX_QUERY_LENGTH);
|
||||
}
|
||||
|
||||
// Remove null bytes
|
||||
sanitized = sanitized.replace(/\0/g, '');
|
||||
|
||||
// Check for dangerous patterns
|
||||
for (const pattern of DANGEROUS_PATTERNS) {
|
||||
sanitized = sanitized.replace(pattern, '');
|
||||
}
|
||||
|
||||
return sanitized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate that the input is safe for use in Elasticsearch queries
|
||||
*/
|
||||
function isValidInput(input: string): boolean {
|
||||
// Check for empty input
|
||||
if (!input || input.length === 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for excessively long input
|
||||
if (input.length > MAX_QUERY_LENGTH) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for control characters (except normal whitespace)
|
||||
if (/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(input)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
export async function POST(request: NextRequest) {
|
||||
try {
|
||||
const { query } = await request.json();
|
||||
const body = await request.json();
|
||||
|
||||
// Validate request body structure
|
||||
if (!body || typeof body !== 'object') {
|
||||
return NextResponse.json(
|
||||
{ error: 'Invalid request body' },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
|
||||
const { query } = body;
|
||||
|
||||
// Validate query type
|
||||
if (!query || typeof query !== 'string') {
|
||||
return NextResponse.json(
|
||||
{ error: 'Query parameter is required' },
|
||||
{ error: 'Query parameter is required and must be a string' },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
|
||||
// Validate input before processing
|
||||
if (!isValidInput(query)) {
|
||||
return NextResponse.json(
|
||||
{ error: 'Invalid query: contains forbidden characters or is too long' },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
|
||||
// Sanitize input
|
||||
const cleanQuery = sanitizeInput(query);
|
||||
|
||||
if (!cleanQuery) {
|
||||
return NextResponse.json(
|
||||
{ error: 'Invalid query: only whitespace or invalid characters provided' },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
@@ -26,15 +113,6 @@ export async function POST(request: NextRequest) {
|
||||
// Ensure index exists
|
||||
await initializeIndex();
|
||||
|
||||
const cleanQuery = query.trim().split(/\s+/)[0];
|
||||
|
||||
if (!cleanQuery) {
|
||||
return NextResponse.json(
|
||||
{ error: 'Invalid query: only whitespace provided' },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
|
||||
const cleanQueryLower = cleanQuery.toLowerCase();
|
||||
const hashType = detectHashType(cleanQueryLower);
|
||||
|
||||
@@ -44,7 +122,7 @@ export async function POST(request: NextRequest) {
|
||||
index: INDEX_NAME,
|
||||
query: {
|
||||
term: {
|
||||
[hashType]: hashType === 'bcrypt' ? cleanQuery : cleanQueryLower
|
||||
[hashType]: cleanQueryLower
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -66,7 +144,6 @@ export async function POST(request: NextRequest) {
|
||||
sha1: source.sha1,
|
||||
sha256: source.sha256,
|
||||
sha512: source.sha512,
|
||||
bcrypt: source.bcrypt,
|
||||
}
|
||||
};
|
||||
})
|
||||
@@ -101,11 +178,10 @@ export async function POST(request: NextRequest) {
|
||||
sha1: existingDoc.sha1,
|
||||
sha256: existingDoc.sha256,
|
||||
sha512: existingDoc.sha512,
|
||||
bcrypt: existingDoc.bcrypt,
|
||||
};
|
||||
} else {
|
||||
// Plaintext not found, generate hashes and check if any hash already exists
|
||||
hashes = await generateHashes(cleanQuery);
|
||||
hashes = generateHashes(cleanQuery);
|
||||
|
||||
const hashExistsResponse = await esClient.search<HashDocument>({
|
||||
index: INDEX_NAME,
|
||||
@@ -147,7 +223,6 @@ export async function POST(request: NextRequest) {
|
||||
sha1: hashes.sha1,
|
||||
sha256: hashes.sha256,
|
||||
sha512: hashes.sha512,
|
||||
bcrypt: hashes.bcrypt,
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -14,8 +14,8 @@ const geistMono = Geist_Mono({
|
||||
|
||||
export const metadata: Metadata = {
|
||||
title: "Hasher - Hash Search & Generator",
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt. Powered by Elasticsearch.",
|
||||
keywords: ["hash", "md5", "sha1", "sha256", "sha512", "bcrypt", "hash generator", "hash search", "elasticsearch"],
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512. Powered by Elasticsearch.",
|
||||
keywords: ["hash", "md5", "sha1", "sha256", "sha512", "hash generator", "hash search", "elasticsearch"],
|
||||
authors: [{ name: "Hasher" }],
|
||||
creator: "Hasher",
|
||||
publisher: "Hasher",
|
||||
@@ -28,7 +28,7 @@ export const metadata: Metadata = {
|
||||
openGraph: {
|
||||
type: "website",
|
||||
title: "Hasher - Hash Search & Generator",
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt.",
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512.",
|
||||
siteName: "Hasher",
|
||||
images: [
|
||||
{
|
||||
@@ -42,7 +42,7 @@ export const metadata: Metadata = {
|
||||
twitter: {
|
||||
card: "summary",
|
||||
title: "Hasher - Hash Search & Generator",
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt.",
|
||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512.",
|
||||
images: ["/logo.png"],
|
||||
},
|
||||
viewport: {
|
||||
|
||||
141
app/page.tsx
141
app/page.tsx
@@ -1,7 +1,8 @@
|
||||
'use client';
|
||||
|
||||
import { useState } from 'react';
|
||||
import { Search, Copy, Check, Hash, Key, AlertCircle, Loader2 } from 'lucide-react';
|
||||
import { useState, useEffect, useCallback, Suspense } from 'react';
|
||||
import { useSearchParams } from 'next/navigation';
|
||||
import { Search, Copy, Check, Hash, Key, AlertCircle, Loader2, Database, Link } from 'lucide-react';
|
||||
|
||||
interface SearchResult {
|
||||
found: boolean;
|
||||
@@ -15,7 +16,6 @@ interface SearchResult {
|
||||
sha1: string;
|
||||
sha256: string;
|
||||
sha512: string;
|
||||
bcrypt: string;
|
||||
};
|
||||
results?: Array<{
|
||||
plaintext: string;
|
||||
@@ -24,22 +24,41 @@ interface SearchResult {
|
||||
sha1: string;
|
||||
sha256: string;
|
||||
sha512: string;
|
||||
bcrypt: string;
|
||||
};
|
||||
}>;
|
||||
message?: string;
|
||||
}
|
||||
|
||||
export default function Home() {
|
||||
interface IndexStats {
|
||||
documentCount: number;
|
||||
indexSize: number;
|
||||
}
|
||||
|
||||
function formatBytes(bytes: number): string {
|
||||
if (bytes === 0) return '0 B';
|
||||
const k = 1024;
|
||||
const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
|
||||
const i = Math.floor(Math.log(bytes) / Math.log(k));
|
||||
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
|
||||
}
|
||||
|
||||
function formatNumber(num: number): string {
|
||||
return num.toLocaleString();
|
||||
}
|
||||
|
||||
function HasherContent() {
|
||||
const searchParams = useSearchParams();
|
||||
const [query, setQuery] = useState('');
|
||||
const [result, setResult] = useState<SearchResult | null>(null);
|
||||
const [loading, setLoading] = useState(false);
|
||||
const [error, setError] = useState('');
|
||||
const [copiedField, setCopiedField] = useState<string | null>(null);
|
||||
const [stats, setStats] = useState<IndexStats | null>(null);
|
||||
const [copiedLink, setCopiedLink] = useState(false);
|
||||
const [initialLoadDone, setInitialLoadDone] = useState(false);
|
||||
|
||||
const handleSearch = async (e: React.FormEvent) => {
|
||||
e.preventDefault();
|
||||
if (!query.trim()) return;
|
||||
const performSearch = useCallback(async (searchQuery: string, updateUrl: boolean = true) => {
|
||||
if (!searchQuery.trim()) return;
|
||||
|
||||
setLoading(true);
|
||||
setError('');
|
||||
@@ -49,7 +68,7 @@ export default function Home() {
|
||||
const response = await fetch('/api/search', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ query: query.trim() })
|
||||
body: JSON.stringify({ query: searchQuery.trim() })
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
@@ -58,11 +77,53 @@ export default function Home() {
|
||||
|
||||
const data = await response.json();
|
||||
setResult(data);
|
||||
|
||||
// Update URL with search query (using history API to avoid re-triggering effects)
|
||||
if (updateUrl) {
|
||||
const newUrl = new URL(window.location.href);
|
||||
newUrl.searchParams.set('q', searchQuery.trim());
|
||||
window.history.replaceState(null, '', newUrl.pathname + newUrl.search);
|
||||
}
|
||||
} catch (_err) {
|
||||
setError('Failed to perform search. Please check your connection.');
|
||||
} finally {
|
||||
setLoading(false);
|
||||
}
|
||||
}, []);
|
||||
|
||||
// Load query from URL on mount (only once)
|
||||
useEffect(() => {
|
||||
if (initialLoadDone) return;
|
||||
|
||||
const urlQuery = searchParams.get('q');
|
||||
if (urlQuery) {
|
||||
setQuery(urlQuery);
|
||||
performSearch(urlQuery, false);
|
||||
}
|
||||
setInitialLoadDone(true);
|
||||
}, [searchParams, performSearch, initialLoadDone]);
|
||||
|
||||
useEffect(() => {
|
||||
const fetchStats = async () => {
|
||||
try {
|
||||
const response = await fetch('/api/health');
|
||||
if (response.ok) {
|
||||
const data = await response.json();
|
||||
if (data.index?.stats) {
|
||||
setStats(data.index.stats);
|
||||
}
|
||||
}
|
||||
} catch (_err) {
|
||||
// Silently fail - stats are not critical
|
||||
}
|
||||
};
|
||||
|
||||
fetchStats();
|
||||
}, [result]); // Refresh stats after each search result
|
||||
|
||||
const handleSearch = async (e: React.FormEvent) => {
|
||||
e.preventDefault();
|
||||
performSearch(query);
|
||||
};
|
||||
|
||||
const copyToClipboard = (text: string, field: string) => {
|
||||
@@ -71,6 +132,14 @@ export default function Home() {
|
||||
setTimeout(() => setCopiedField(null), 2000);
|
||||
};
|
||||
|
||||
const copyShareLink = () => {
|
||||
const url = new URL(window.location.href);
|
||||
url.searchParams.set('q', query.trim());
|
||||
navigator.clipboard.writeText(url.toString());
|
||||
setCopiedLink(true);
|
||||
setTimeout(() => setCopiedLink(false), 2000);
|
||||
};
|
||||
|
||||
const HashDisplay = ({ label, value, field }: { label: string; value: string; field: string }) => (
|
||||
<div className="bg-gray-50 rounded-lg p-4 border border-gray-200">
|
||||
<div className="flex items-center justify-between mb-2">
|
||||
@@ -108,8 +177,20 @@ export default function Home() {
|
||||
Search for hashes or generate them from plaintext
|
||||
</p>
|
||||
<p className="text-sm text-gray-500 mt-2">
|
||||
Supports MD5, SHA1, SHA256, SHA512, and Bcrypt
|
||||
Supports MD5, SHA1, SHA256, and SHA512
|
||||
</p>
|
||||
{stats && (
|
||||
<div className="flex items-center justify-center gap-4 mt-4 text-sm text-gray-500">
|
||||
<div className="flex items-center gap-1.5">
|
||||
<Database className="w-4 h-4" />
|
||||
<span><strong>{formatNumber(stats.documentCount)}</strong> hashes</span>
|
||||
</div>
|
||||
<span className="text-gray-300">•</span>
|
||||
<div>
|
||||
<span><strong>{formatBytes(stats.indexSize)}</strong> indexed</span>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Search Form */}
|
||||
@@ -120,12 +201,27 @@ export default function Home() {
|
||||
value={query}
|
||||
onChange={(e) => setQuery(e.target.value)}
|
||||
placeholder="Enter a hash or plaintext..."
|
||||
className="w-full px-6 py-4 pr-14 text-lg rounded-2xl border-2 border-gray-200 focus:border-blue-500 focus:ring-4 focus:ring-blue-100 outline-none transition-all shadow-sm"
|
||||
className="w-full px-6 py-4 pr-28 text-lg rounded-2xl border-2 border-gray-200 focus:border-blue-500 focus:ring-4 focus:ring-blue-100 outline-none transition-all shadow-sm"
|
||||
/>
|
||||
<div className="absolute right-2 top-1/2 -translate-y-1/2 flex gap-1">
|
||||
{query.trim() && (
|
||||
<button
|
||||
type="button"
|
||||
onClick={copyShareLink}
|
||||
className="bg-gray-100 text-gray-600 p-3 rounded-xl hover:bg-gray-200 transition-all"
|
||||
title="Copy share link"
|
||||
>
|
||||
{copiedLink ? (
|
||||
<Check className="w-6 h-6 text-green-600" />
|
||||
) : (
|
||||
<Link className="w-6 h-6" />
|
||||
)}
|
||||
</button>
|
||||
)}
|
||||
<button
|
||||
type="submit"
|
||||
disabled={loading || !query.trim()}
|
||||
className="absolute right-2 top-1/2 -translate-y-1/2 bg-gradient-to-r from-blue-600 to-purple-600 text-white p-3 rounded-xl hover:shadow-lg disabled:opacity-50 disabled:cursor-not-allowed transition-all"
|
||||
className="bg-gradient-to-r from-blue-600 to-purple-600 text-white p-3 rounded-xl hover:shadow-lg disabled:opacity-50 disabled:cursor-not-allowed transition-all"
|
||||
>
|
||||
{loading ? (
|
||||
<Loader2 className="w-6 h-6 animate-spin" />
|
||||
@@ -134,6 +230,7 @@ export default function Home() {
|
||||
)}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
{/* Error Message */}
|
||||
@@ -166,7 +263,6 @@ export default function Home() {
|
||||
<HashDisplay label="SHA1" value={result.hashes!.sha1} field="sha1-gen" />
|
||||
<HashDisplay label="SHA256" value={result.hashes!.sha256} field="sha256-gen" />
|
||||
<HashDisplay label="SHA512" value={result.hashes!.sha512} field="sha512-gen" />
|
||||
<HashDisplay label="Bcrypt" value={result.hashes!.bcrypt} field="bcrypt-gen" />
|
||||
</div>
|
||||
{result.wasGenerated && (
|
||||
<div className="mt-6 bg-blue-50 border border-blue-200 rounded-xl p-4">
|
||||
@@ -212,7 +308,6 @@ export default function Home() {
|
||||
<HashDisplay label="SHA1" value={item.hashes.sha1} field={`sha1-${idx}`} />
|
||||
<HashDisplay label="SHA256" value={item.hashes.sha256} field={`sha256-${idx}`} />
|
||||
<HashDisplay label="SHA512" value={item.hashes.sha512} field={`sha512-${idx}`} />
|
||||
<HashDisplay label="Bcrypt" value={item.hashes.bcrypt} field={`bcrypt-${idx}`} />
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
@@ -256,7 +351,7 @@ export default function Home() {
|
||||
</div>
|
||||
<h3 className="text-xl font-bold text-gray-900 mb-2">Generate Hashes</h3>
|
||||
<p className="text-gray-600">
|
||||
Enter any plaintext to instantly generate MD5, SHA1, SHA256, SHA512, and Bcrypt hashes. Results are saved automatically.
|
||||
Enter any plaintext to instantly generate MD5, SHA1, SHA256, and SHA512 hashes. Results are saved automatically.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
@@ -271,3 +366,19 @@ export default function Home() {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Full-screen centered spinner rendered while the main search UI is
 * suspended (used as the <Suspense> fallback in Home below).
 */
function LoadingFallback() {
  return (
    <div className="min-h-screen bg-gradient-to-br from-blue-50 via-white to-purple-50 flex items-center justify-center">
      <Loader2 className="w-12 h-12 text-blue-600 animate-spin" />
    </div>
  );
}
|
||||
|
||||
/**
 * Page entry point. HasherContent reads the URL query via
 * useSearchParams, so it is wrapped in a Suspense boundary here.
 * NOTE(review): presumably required because Next.js mandates a
 * <Suspense> boundary around useSearchParams consumers — confirm
 * against the Next.js app-router documentation.
 */
export default function Home() {
  return (
    <Suspense fallback={<LoadingFallback />}>
      <HasherContent />
    </Suspense>
  );
}
|
||||
|
||||
|
||||
@@ -46,9 +46,6 @@ export const INDEX_MAPPING = {
|
||||
sha512: {
|
||||
type: 'keyword' as const
|
||||
},
|
||||
bcrypt: {
|
||||
type: 'keyword' as const
|
||||
},
|
||||
created_at: {
|
||||
type: 'date' as const
|
||||
}
|
||||
|
||||
23
lib/hash.ts
23
lib/hash.ts
@@ -1,5 +1,4 @@
|
||||
import crypto from 'crypto';
|
||||
import bcrypt from 'bcrypt';
|
||||
|
||||
export interface HashResult {
|
||||
plaintext: string;
|
||||
@@ -7,22 +6,18 @@ export interface HashResult {
|
||||
sha1: string;
|
||||
sha256: string;
|
||||
sha512: string;
|
||||
bcrypt: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate all common hashes for a given plaintext
|
||||
*/
|
||||
export async function generateHashes(plaintext: string): Promise<HashResult> {
|
||||
const bcryptHash = await bcrypt.hash(plaintext, 10);
|
||||
|
||||
export function generateHashes(plaintext: string): HashResult {
|
||||
return {
|
||||
plaintext,
|
||||
md5: crypto.createHash('md5').update(plaintext).digest('hex'),
|
||||
sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
|
||||
sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
|
||||
sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
|
||||
bcrypt: bcryptHash,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -52,11 +47,6 @@ export function detectHashType(hash: string): string | null {
|
||||
return 'sha512';
|
||||
}
|
||||
|
||||
// BCrypt: starts with $2a$, $2b$, $2x$, or $2y$
|
||||
if (/^\$2[abxy]\$/.test(cleanHash)) {
|
||||
return 'bcrypt';
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -66,14 +56,3 @@ export function detectHashType(hash: string): string | null {
|
||||
export function isHash(input: string): boolean {
|
||||
return detectHashType(input) !== null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify a plaintext against a bcrypt hash
|
||||
*/
|
||||
export async function verifyBcrypt(plaintext: string, hash: string): Promise<boolean> {
|
||||
try {
|
||||
return await bcrypt.compare(plaintext, hash);
|
||||
} catch (_error) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,12 +34,11 @@
|
||||
"build": "next build",
|
||||
"start": "next start",
|
||||
"lint": "eslint",
|
||||
"index-file": "tsx scripts/index-file.ts"
|
||||
"index-file": "tsx scripts/index-file.ts",
|
||||
"remove-duplicates": "tsx scripts/remove-duplicates.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@elastic/elasticsearch": "^9.2.0",
|
||||
"@types/bcrypt": "^6.0.0",
|
||||
"bcrypt": "^6.0.0",
|
||||
"lucide-react": "^0.555.0",
|
||||
"next": "15.4.8",
|
||||
"react": "19.1.2",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "Hasher - Hash Search & Generator",
|
||||
"short_name": "Hasher",
|
||||
"description": "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, SHA512, and Bcrypt.",
|
||||
"description": "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512.",
|
||||
"start_url": "/",
|
||||
"display": "standalone",
|
||||
"background_color": "#ffffff",
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
* --batch-size=<number> Number of items to process in each batch (default: 100)
|
||||
* --resume Resume from last saved state (default: true)
|
||||
* --no-resume Start from beginning, ignore saved state
|
||||
* --no-check Skip duplicate checking (faster, but may create duplicates)
|
||||
* --state-file=<path> Custom state file path (default: .indexer-state-<filename>.json)
|
||||
* --help, -h Show this help message
|
||||
*/
|
||||
@@ -34,7 +35,6 @@ interface HashDocument {
|
||||
sha1: string;
|
||||
sha256: string;
|
||||
sha512: string;
|
||||
bcrypt: string;
|
||||
created_at: string;
|
||||
}
|
||||
|
||||
@@ -54,6 +54,7 @@ interface ParsedArgs {
|
||||
filePath: string | null;
|
||||
batchSize: number;
|
||||
resume: boolean;
|
||||
checkDuplicates: boolean;
|
||||
stateFile: string | null;
|
||||
showHelp: boolean;
|
||||
}
|
||||
@@ -63,6 +64,7 @@ function parseArgs(args: string[]): ParsedArgs {
|
||||
filePath: null,
|
||||
batchSize: DEFAULT_BATCH_SIZE,
|
||||
resume: true,
|
||||
checkDuplicates: true,
|
||||
stateFile: null,
|
||||
showHelp: false
|
||||
};
|
||||
@@ -76,6 +78,8 @@ function parseArgs(args: string[]): ParsedArgs {
|
||||
result.resume = true;
|
||||
} else if (arg === '--no-resume') {
|
||||
result.resume = false;
|
||||
} else if (arg === '--no-check') {
|
||||
result.checkDuplicates = false;
|
||||
} else if (arg.startsWith('--batch-size=')) {
|
||||
const value = arg.split('=')[1];
|
||||
const parsed = parseInt(value, 10);
|
||||
@@ -152,17 +156,13 @@ function deleteState(stateFile: string): void {
|
||||
}
|
||||
}
|
||||
|
||||
async function generateHashes(plaintext: string): Promise<HashDocument> {
|
||||
const bcrypt = await import('bcrypt');
|
||||
const bcryptHash = await bcrypt.default.hash(plaintext, 10);
|
||||
|
||||
function generateHashes(plaintext: string): HashDocument {
|
||||
return {
|
||||
plaintext,
|
||||
md5: crypto.createHash('md5').update(plaintext).digest('hex'),
|
||||
sha1: crypto.createHash('sha1').update(plaintext).digest('hex'),
|
||||
sha256: crypto.createHash('sha256').update(plaintext).digest('hex'),
|
||||
sha512: crypto.createHash('sha512').update(plaintext).digest('hex'),
|
||||
bcrypt: bcryptHash,
|
||||
created_at: new Date().toISOString()
|
||||
};
|
||||
}
|
||||
@@ -180,6 +180,7 @@ Options:
|
||||
--batch-size <number> Alternative syntax for batch size
|
||||
--resume Resume from last saved state (default)
|
||||
--no-resume Start from beginning, ignore saved state
|
||||
--no-check Skip duplicate checking (faster, but may create duplicates)
|
||||
--state-file=<path> Custom state file path
|
||||
--help, -h Show this help message
|
||||
|
||||
@@ -191,17 +192,23 @@ Examples:
|
||||
npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
|
||||
npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
|
||||
npx tsx scripts/index-file.ts wordlist.txt --no-resume
|
||||
npm run index-file -- wordlist.txt --batch-size=500
|
||||
npx tsx scripts/index-file.ts wordlist.txt --no-check
|
||||
npm run index-file -- wordlist.txt --batch-size=500 --no-check
|
||||
|
||||
State Management:
|
||||
The script automatically saves progress to a state file. If interrupted,
|
||||
it will resume from where it left off on the next run. Use --no-resume
|
||||
to start fresh.
|
||||
|
||||
Duplicate Checking:
|
||||
By default, the script checks if each plaintext or hash already exists
|
||||
in the index before inserting. Use --no-check to skip this verification
|
||||
for faster indexing (useful when you're sure there are no duplicates).
|
||||
`);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, customStateFile: string | null) {
|
||||
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
|
||||
const client = new Client({ node: ELASTICSEARCH_NODE });
|
||||
const absolutePath = resolve(filePath);
|
||||
const stateFile = customStateFile || getDefaultStateFile(absolutePath);
|
||||
@@ -244,6 +251,7 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
||||
console.log(`Index: ${INDEX_NAME}`);
|
||||
console.log(`File: ${filePath}`);
|
||||
console.log(`Batch size: ${batchSize}`);
|
||||
console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
|
||||
console.log(`State file: ${stateFile}`);
|
||||
if (resumingFrom > 0) {
|
||||
console.log(`Resuming from: line ${resumingFrom}`);
|
||||
@@ -300,13 +308,12 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
||||
const bulkOperations: any[] = [];
|
||||
|
||||
// Generate hashes for all items in batch first
|
||||
const batchWithHashes = await Promise.all(
|
||||
batch.map(async (plaintext: string) => ({
|
||||
const batchWithHashes = batch.map((plaintext: string) => ({
|
||||
plaintext,
|
||||
hashes: await generateHashes(plaintext)
|
||||
}))
|
||||
);
|
||||
hashes: generateHashes(plaintext)
|
||||
}));
|
||||
|
||||
if (checkDuplicates) {
|
||||
// Check which items already exist (by plaintext or any hash)
|
||||
const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
|
||||
const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
|
||||
@@ -343,7 +350,6 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
||||
});
|
||||
|
||||
// Prepare bulk operations only for items that don't have any duplicate hash
|
||||
let batchSkipped = 0;
|
||||
for (const item of batchWithHashes) {
|
||||
const isDuplicate =
|
||||
existingHashes.has(item.plaintext) ||
|
||||
@@ -356,11 +362,17 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
||||
bulkOperations.push({ index: { _index: INDEX_NAME } });
|
||||
bulkOperations.push(item.hashes);
|
||||
} else {
|
||||
batchSkipped++;
|
||||
state.skipped++;
|
||||
sessionSkipped++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No duplicate checking - index everything
|
||||
for (const item of batchWithHashes) {
|
||||
bulkOperations.push({ index: { _index: INDEX_NAME } });
|
||||
bulkOperations.push(item.hashes);
|
||||
}
|
||||
}
|
||||
|
||||
// Execute bulk operation only if there are new items to insert
|
||||
if (bulkOperations.length > 0) {
|
||||
@@ -498,9 +510,10 @@ console.log(`\n🔧 Configuration:`);
|
||||
console.log(` File: ${filePath}`);
|
||||
console.log(` Batch size: ${parsedArgs.batchSize}`);
|
||||
console.log(` Resume: ${parsedArgs.resume}`);
|
||||
console.log(` Check duplicates: ${parsedArgs.checkDuplicates}`);
|
||||
if (parsedArgs.stateFile) {
|
||||
console.log(` State file: ${parsedArgs.stateFile}`);
|
||||
}
|
||||
console.log('');
|
||||
|
||||
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.stateFile).catch(console.error);
|
||||
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);
|
||||
|
||||
496
scripts/remove-duplicates.ts
Archivo normal
496
scripts/remove-duplicates.ts
Archivo normal
@@ -0,0 +1,496 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* Hasher Duplicate Remover Script
|
||||
*
|
||||
* This script finds and removes duplicate entries from the Elasticsearch index.
|
||||
* It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/remove-duplicates.ts [options]
|
||||
* npm run remove-duplicates [-- options]
|
||||
*
|
||||
* Options:
|
||||
* --dry-run Show duplicates without removing them (default)
|
||||
* --execute Actually remove the duplicates
|
||||
* --batch-size=<number> Number of items to process in each batch (default: 1000)
|
||||
* --field=<field> Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
|
||||
* --help, -h Show this help message
|
||||
*/
|
||||
|
||||
import { Client } from '@elastic/elasticsearch';
|
||||
|
||||
// Elasticsearch endpoint; override with the ELASTICSEARCH_NODE env var.
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
// Index holding the plaintext/hash documents this script deduplicates.
const INDEX_NAME = 'hasher';
|
||||
const DEFAULT_BATCH_SIZE = 1000;
|
||||
|
||||
interface ParsedArgs {
|
||||
dryRun: boolean;
|
||||
batchSize: number;
|
||||
field: string | null;
|
||||
showHelp: boolean;
|
||||
}
|
||||
|
||||
interface DuplicateGroup {
|
||||
value: string;
|
||||
field: string;
|
||||
documentIds: string[];
|
||||
keepId: string;
|
||||
deleteIds: string[];
|
||||
}
|
||||
|
||||
function parseArgs(args: string[]): ParsedArgs {
|
||||
const result: ParsedArgs = {
|
||||
dryRun: true,
|
||||
batchSize: DEFAULT_BATCH_SIZE,
|
||||
field: null,
|
||||
showHelp: false
|
||||
};
|
||||
|
||||
for (let i = 0; i < args.length; i++) {
|
||||
const arg = args[i];
|
||||
|
||||
if (arg === '--help' || arg === '-h') {
|
||||
result.showHelp = true;
|
||||
} else if (arg === '--dry-run') {
|
||||
result.dryRun = true;
|
||||
} else if (arg === '--execute') {
|
||||
result.dryRun = false;
|
||||
} else if (arg.startsWith('--batch-size=')) {
|
||||
const value = arg.split('=')[1];
|
||||
const parsed = parseInt(value, 10);
|
||||
if (!isNaN(parsed) && parsed > 0) {
|
||||
result.batchSize = parsed;
|
||||
}
|
||||
} else if (arg === '--batch-size') {
|
||||
const nextArg = args[i + 1];
|
||||
if (nextArg && !nextArg.startsWith('-')) {
|
||||
const parsed = parseInt(nextArg, 10);
|
||||
if (!isNaN(parsed) && parsed > 0) {
|
||||
result.batchSize = parsed;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
} else if (arg.startsWith('--field=')) {
|
||||
result.field = arg.split('=')[1];
|
||||
} else if (arg === '--field') {
|
||||
const nextArg = args[i + 1];
|
||||
if (nextArg && !nextArg.startsWith('-')) {
|
||||
result.field = nextArg;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Prints the usage text to stdout and terminates the process with exit
 * code 0. Invoked when --help / -h is present on the command line.
 */
function showHelp() {
  console.log(`
Hasher Duplicate Remover Script

Usage:
  npx tsx scripts/remove-duplicates.ts [options]
  npm run remove-duplicates [-- options]

Options:
  --dry-run              Show duplicates without removing them (default)
  --execute              Actually remove the duplicates
  --batch-size=<number>  Number of items to process in each batch (default: 1000)
  --field=<field>        Check duplicates only on this field
                         Valid fields: plaintext, md5, sha1, sha256, sha512
  --help, -h             Show this help message

Environment Variables:
  ELASTICSEARCH_NODE     Elasticsearch node URL (default: http://localhost:9200)

Examples:
  npx tsx scripts/remove-duplicates.ts                    # Dry run, show all duplicates
  npx tsx scripts/remove-duplicates.ts --execute          # Remove all duplicates
  npx tsx scripts/remove-duplicates.ts --field=md5        # Check only md5 duplicates
  npx tsx scripts/remove-duplicates.ts --execute --field=plaintext

Notes:
  - The script keeps the OLDEST document (by created_at) and removes newer duplicates
  - Always run with --dry-run first to review what will be deleted
  - Duplicates are checked across all hash fields by default
`);
  // Help is a terminal action: never fall through into the main workflow.
  process.exit(0);
}
|
||||
|
||||
async function findDuplicatesForField(
|
||||
client: Client,
|
||||
field: string,
|
||||
batchSize: number
|
||||
): Promise<DuplicateGroup[]> {
|
||||
const duplicates: DuplicateGroup[] = [];
|
||||
|
||||
// Use aggregation to find duplicate values
|
||||
const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
|
||||
|
||||
// Use composite aggregation to handle large number of duplicates
|
||||
let afterKey: any = undefined;
|
||||
let hasMore = true;
|
||||
|
||||
console.log(` Scanning for duplicates...`);
|
||||
|
||||
while (hasMore) {
|
||||
const aggQuery: any = {
|
||||
index: INDEX_NAME,
|
||||
size: 0,
|
||||
aggs: {
|
||||
duplicates: {
|
||||
composite: {
|
||||
size: batchSize,
|
||||
sources: [
|
||||
{ value: { terms: { field: fieldToAggregate } } }
|
||||
],
|
||||
...(afterKey && { after: afterKey })
|
||||
},
|
||||
aggs: {
|
||||
doc_count_filter: {
|
||||
bucket_selector: {
|
||||
buckets_path: { count: '_count' },
|
||||
script: 'params.count > 1'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const response = await client.search(aggQuery);
|
||||
const compositeAgg = response.aggregations?.duplicates as any;
|
||||
const buckets = compositeAgg?.buckets || [];
|
||||
|
||||
for (const bucket of buckets) {
|
||||
if (bucket.doc_count > 1) {
|
||||
const value = bucket.key.value;
|
||||
|
||||
// Use scroll API for large result sets
|
||||
const documentIds: string[] = [];
|
||||
|
||||
let scrollResponse = await client.search({
|
||||
index: INDEX_NAME,
|
||||
scroll: '1m',
|
||||
size: 1000,
|
||||
query: {
|
||||
term: {
|
||||
[fieldToAggregate]: value
|
||||
}
|
||||
},
|
||||
sort: [
|
||||
{ created_at: { order: 'asc' } }
|
||||
],
|
||||
_source: false
|
||||
});
|
||||
|
||||
while (scrollResponse.hits.hits.length > 0) {
|
||||
documentIds.push(...scrollResponse.hits.hits.map((hit: any) => hit._id));
|
||||
|
||||
if (!scrollResponse._scroll_id) break;
|
||||
|
||||
scrollResponse = await client.scroll({
|
||||
scroll_id: scrollResponse._scroll_id,
|
||||
scroll: '1m'
|
||||
});
|
||||
}
|
||||
|
||||
// Clear scroll
|
||||
if (scrollResponse._scroll_id) {
|
||||
await client.clearScroll({ scroll_id: scrollResponse._scroll_id }).catch(() => {});
|
||||
}
|
||||
|
||||
if (documentIds.length > 1) {
|
||||
duplicates.push({
|
||||
value: String(value),
|
||||
field,
|
||||
documentIds,
|
||||
keepId: documentIds[0], // Keep the oldest
|
||||
deleteIds: documentIds.slice(1) // Delete the rest
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check if there are more results
|
||||
afterKey = compositeAgg?.after_key;
|
||||
hasMore = buckets.length === batchSize && afterKey;
|
||||
|
||||
if (hasMore) {
|
||||
process.stdout.write(`\r Found ${duplicates.length} duplicate groups so far...`);
|
||||
}
|
||||
}
|
||||
|
||||
return duplicates;
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 1: Initialize and connect to Elasticsearch
|
||||
*/
|
||||
async function phase1_InitAndConnect() {
|
||||
console.log(`🔍 Hasher Duplicate Remover - Phase 1: Initialization`);
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
|
||||
console.log(`Index: ${INDEX_NAME}`);
|
||||
console.log('');
|
||||
|
||||
const client = new Client({ node: ELASTICSEARCH_NODE });
|
||||
|
||||
console.log('🔗 Connecting to Elasticsearch...');
|
||||
await client.cluster.health({});
|
||||
console.log('✅ Connected successfully\n');
|
||||
|
||||
const countResponse = await client.count({ index: INDEX_NAME });
|
||||
console.log(`📊 Total documents in index: ${countResponse.count}\n`);
|
||||
|
||||
return { client, totalDocuments: countResponse.count };
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 2: Find duplicates for a specific field
|
||||
*/
|
||||
async function phase2_FindDuplicatesForField(
|
||||
client: Client,
|
||||
field: string,
|
||||
batchSize: number,
|
||||
seenDeleteIds: Set<string>
|
||||
): Promise<{ duplicates: DuplicateGroup[], totalFound: number }> {
|
||||
console.log(`\n🔍 Phase 2: Checking duplicates for field: ${field}...`);
|
||||
|
||||
const fieldDuplicates = await findDuplicatesForField(client, field, batchSize);
|
||||
const duplicates: DuplicateGroup[] = [];
|
||||
|
||||
// Filter out already seen delete IDs to avoid counting the same document multiple times
|
||||
for (const dup of fieldDuplicates) {
|
||||
const newDeleteIds = dup.deleteIds.filter(id => !seenDeleteIds.has(id));
|
||||
if (newDeleteIds.length > 0) {
|
||||
dup.deleteIds = newDeleteIds;
|
||||
newDeleteIds.forEach(id => seenDeleteIds.add(id));
|
||||
duplicates.push(dup);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
|
||||
console.log(` New unique documents to delete: ${duplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0)}`);
|
||||
|
||||
// Force garbage collection if available
|
||||
if (global.gc) {
|
||||
global.gc();
|
||||
console.log(` ♻️ Memory freed after processing ${field}`);
|
||||
}
|
||||
|
||||
return { duplicates, totalFound: fieldDuplicates.length };
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 3: Process deletion for a batch of duplicates
|
||||
*/
|
||||
async function phase3_DeleteBatch(
|
||||
client: Client,
|
||||
deleteIds: string[],
|
||||
batchSize: number,
|
||||
startIndex: number
|
||||
): Promise<{ deleted: number, errors: number }> {
|
||||
const batch = deleteIds.slice(startIndex, startIndex + batchSize);
|
||||
let deleted = 0;
|
||||
let errors = 0;
|
||||
|
||||
try {
|
||||
const bulkOperations = batch.flatMap(id => [
|
||||
{ delete: { _index: INDEX_NAME, _id: id } }
|
||||
]);
|
||||
|
||||
const bulkResponse = await client.bulk({
|
||||
operations: bulkOperations,
|
||||
refresh: false
|
||||
});
|
||||
|
||||
if (bulkResponse.errors) {
|
||||
const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
|
||||
errors += errorCount;
|
||||
deleted += batch.length - errorCount;
|
||||
} else {
|
||||
deleted += batch.length;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`\n❌ Error deleting batch:`, error);
|
||||
errors += batch.length;
|
||||
}
|
||||
|
||||
// Force garbage collection if available
|
||||
if (global.gc) {
|
||||
global.gc();
|
||||
}
|
||||
|
||||
return { deleted, errors };
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 4: Finalize and report results
|
||||
*/
|
||||
async function phase4_Finalize(
|
||||
client: Client,
|
||||
totalDeleted: number,
|
||||
totalErrors: number,
|
||||
initialDocumentCount: number
|
||||
) {
|
||||
console.log('\n\n🔄 Phase 4: Refreshing index...');
|
||||
await client.indices.refresh({ index: INDEX_NAME });
|
||||
|
||||
const newCountResponse = await client.count({ index: INDEX_NAME });
|
||||
|
||||
console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
|
||||
console.log('✅ Duplicate removal complete!');
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`Documents deleted: ${totalDeleted}`);
|
||||
console.log(`Errors: ${totalErrors}`);
|
||||
console.log(`Previous document count: ${initialDocumentCount}`);
|
||||
console.log(`New document count: ${newCountResponse.count}`);
|
||||
console.log('');
|
||||
}
|
||||
|
||||
async function removeDuplicates(parsedArgs: ParsedArgs) {
|
||||
const fields = parsedArgs.field
|
||||
? [parsedArgs.field]
|
||||
: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
||||
|
||||
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
|
||||
console.log(`Batch size: ${parsedArgs.batchSize}`);
|
||||
console.log(`Fields to check: ${fields.join(', ')}`);
|
||||
console.log('');
|
||||
|
||||
try {
|
||||
// === PHASE 1: Initialize ===
|
||||
const { client, totalDocuments } = await phase1_InitAndConnect();
|
||||
|
||||
// Force garbage collection after phase 1
|
||||
if (global.gc) {
|
||||
global.gc();
|
||||
console.log('♻️ Memory freed after initialization\n');
|
||||
}
|
||||
|
||||
// === PHASE 2: Find duplicates field by field ===
|
||||
const allDuplicates: DuplicateGroup[] = [];
|
||||
const seenDeleteIds = new Set<string>();
|
||||
|
||||
for (const field of fields) {
|
||||
const { duplicates } = await phase2_FindDuplicatesForField(
|
||||
client,
|
||||
field,
|
||||
parsedArgs.batchSize,
|
||||
seenDeleteIds
|
||||
);
|
||||
allDuplicates.push(...duplicates);
|
||||
|
||||
// Clear field duplicates to free memory
|
||||
duplicates.length = 0;
|
||||
}
|
||||
|
||||
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
|
||||
|
||||
console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`📋 Summary:`);
|
||||
console.log(` Duplicate groups found: ${allDuplicates.length}`);
|
||||
console.log(` Documents to delete: ${totalToDelete}`);
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
||||
|
||||
if (allDuplicates.length === 0) {
|
||||
console.log('✨ No duplicates found! Index is clean.\n');
|
||||
return;
|
||||
}
|
||||
|
||||
// Show sample of duplicates
|
||||
console.log(`📝 Sample duplicates (showing first 10):\n`);
|
||||
const samplesToShow = allDuplicates.slice(0, 10);
|
||||
for (const dup of samplesToShow) {
|
||||
const truncatedValue = dup.value.length > 50
|
||||
? dup.value.substring(0, 50) + '...'
|
||||
: dup.value;
|
||||
console.log(` Field: ${dup.field}`);
|
||||
console.log(` Value: ${truncatedValue}`);
|
||||
console.log(` Keep: ${dup.keepId}`);
|
||||
console.log(` Delete: ${dup.deleteIds.length} document(s)`);
|
||||
console.log('');
|
||||
}
|
||||
|
||||
if (allDuplicates.length > 10) {
|
||||
console.log(` ... and ${allDuplicates.length - 10} more duplicate groups\n`);
|
||||
}
|
||||
|
||||
if (parsedArgs.dryRun) {
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`🔎 DRY RUN - No changes made`);
|
||||
console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
||||
return;
|
||||
}
|
||||
|
||||
// === PHASE 3: Execute deletion in batches ===
|
||||
console.log(`\n🗑️ Phase 3: Removing ${totalToDelete} duplicate documents...\n`);
|
||||
|
||||
let totalDeleted = 0;
|
||||
let totalErrors = 0;
|
||||
const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
|
||||
|
||||
// Clear allDuplicates to free memory
|
||||
allDuplicates.length = 0;
|
||||
|
||||
// Delete in batches with memory management
|
||||
for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
|
||||
const { deleted, errors } = await phase3_DeleteBatch(
|
||||
client,
|
||||
deleteIds,
|
||||
parsedArgs.batchSize,
|
||||
i
|
||||
);
|
||||
|
||||
totalDeleted += deleted;
|
||||
totalErrors += errors;
|
||||
|
||||
process.stdout.write(
|
||||
`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - ` +
|
||||
`Deleted: ${totalDeleted}, Errors: ${totalErrors}`
|
||||
);
|
||||
}
|
||||
|
||||
// Clear deleteIds to free memory
|
||||
deleteIds.length = 0;
|
||||
seenDeleteIds.clear();
|
||||
|
||||
// === PHASE 4: Finalize ===
|
||||
await phase4_Finalize(client, totalDeleted, totalErrors, totalDocuments);
|
||||
|
||||
} catch (error) {
|
||||
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Parse command line arguments
|
||||
const args = process.argv.slice(2);
|
||||
const parsedArgs = parseArgs(args);
|
||||
|
||||
if (parsedArgs.showHelp) {
|
||||
showHelp();
|
||||
}
|
||||
|
||||
// Validate field if provided
|
||||
const validFields = ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
||||
if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
|
||||
console.error(`❌ Invalid field: ${parsedArgs.field}`);
|
||||
console.error(` Valid fields: ${validFields.join(', ')}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log(`\n🔧 Configuration:`);
|
||||
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
|
||||
console.log(` Batch size: ${parsedArgs.batchSize}`);
|
||||
if (parsedArgs.field) {
|
||||
console.log(` Field: ${parsedArgs.field}`);
|
||||
} else {
|
||||
console.log(` Fields: all (plaintext, md5, sha1, sha256, sha512)`);
|
||||
}
|
||||
console.log('');
|
||||
|
||||
removeDuplicates(parsedArgs).catch(console.error);
|
||||
Referencia en una nueva incidencia
Block a user